Index: linux-2.6.14/arch/alpha/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/alpha/kernel/time.c +++ linux-2.6.14/arch/alpha/kernel/time.c @@ -55,10 +55,6 @@ #include "proto.h" #include "irq_impl.h" -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - extern unsigned long wall_jiffies; /* kernel/timer.c */ static int set_rtc_mmss(unsigned long); Index: linux-2.6.14/arch/arm/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/arm/kernel/time.c +++ linux-2.6.14/arch/arm/kernel/time.c @@ -36,10 +36,6 @@ #include #include -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - /* * Our system timer. */ Index: linux-2.6.14/arch/arm26/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/arm26/kernel/time.c +++ linux-2.6.14/arch/arm26/kernel/time.c @@ -34,10 +34,6 @@ #include #include -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - extern unsigned long wall_jiffies; /* this needs a better home */ Index: linux-2.6.14/arch/cris/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/cris/kernel/time.c +++ linux-2.6.14/arch/cris/kernel/time.c @@ -32,10 +32,6 @@ #include #include -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - int have_rtc; /* used to remember if we have an RTC or not */; #define TICK_SIZE tick Index: linux-2.6.14/arch/frv/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/frv/kernel/time.c +++ linux-2.6.14/arch/frv/kernel/time.c @@ -34,9 +34,6 @@ extern unsigned long wall_jiffies; -u64 jiffies_64 = INITIAL_JIFFIES; -EXPORT_SYMBOL(jiffies_64); - unsigned long __nongprelbss __clkin_clock_speed_HZ; unsigned long __nongprelbss __ext_bus_clock_speed_HZ; unsigned long __nongprelbss __res_bus_clock_speed_HZ; Index: linux-2.6.14/arch/h8300/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/h8300/kernel/time.c +++ linux-2.6.14/arch/h8300/kernel/time.c @@ -32,10 +32,6 @@ #define TICK_SIZE (tick_nsec / 1000) -u64 jiffies_64; - -EXPORT_SYMBOL(jiffies_64); - /* * timer_interrupt() needs to keep up the real-time clock, * as well as call the "do_timer()" routine every clocktick Index: linux-2.6.14/arch/i386/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/i386/kernel/time.c +++ linux-2.6.14/arch/i386/kernel/time.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -56,6 +57,7 @@ #include #include #include +#include #include "mach_time.h" @@ -74,10 +76,6 @@ int pit_latch_buggy; /* ext #include "do_timer.h" -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - unsigned int cpu_khz; /* Detected as we calibrate the TSC */ EXPORT_SYMBOL(cpu_khz); @@ -86,13 +84,6 @@ extern unsigned long wall_jiffies; DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); -#include - -DEFINE_SPINLOCK(i8253_lock); -EXPORT_SYMBOL(i8253_lock); - -struct timer_opts *cur_timer __read_mostly = &timer_none; - /* * This is a special lock that is owned by the CPU and holds the index * register we are working with. 
It is required for NMI access to the @@ -122,116 +113,23 @@ void rtc_cmos_write(unsigned char val, u } EXPORT_SYMBOL(rtc_cmos_write); -/* - * This version of gettimeofday has microsecond resolution - * and better than microsecond precision on fast x86 machines with TSC. - */ -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq; - unsigned long usec, sec; - unsigned long max_ntp_tick; - - do { - unsigned long lost; - - seq = read_seqbegin(&xtime_lock); - - usec = cur_timer->get_offset(); - lost = jiffies - wall_jiffies; - - /* - * If time_adjust is negative then NTP is slowing the clock - * so make sure not to go into next possible interval. - * Better to lose some accuracy than have time go backwards.. - */ - if (unlikely(time_adjust < 0)) { - max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj; - usec = min(usec, max_ntp_tick); - - if (lost) - usec += lost * max_ntp_tick; - } - else if (unlikely(lost)) - usec += lost * (USEC_PER_SEC / HZ); - - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - } while (read_seqretry(&xtime_lock, seq)); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - /* - * This is revolting. We need to set "xtime" correctly. However, the - * value in this location is the value at the most recent update of - * wall time. Discover what correction gettimeofday() would have - * made, and then undo it! - */ - nsec -= cur_timer->get_offset() * NSEC_PER_USEC; - nsec -= (jiffies - wall_jiffies) * TICK_NSEC; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - static int set_rtc_mmss(unsigned long nowtime) { int retval; - - WARN_ON(irqs_disabled()); + unsigned long flags; /* gets recalled with irq locally disabled */ - spin_lock_irq(&rtc_lock); + /* XXX - does irqsave resolve this? -johnstul */ + spin_lock_irqsave(&rtc_lock, flags); if (efi_enabled) retval = efi_set_rtc_mmss(nowtime); else retval = mach_set_rtc_mmss(nowtime); - spin_unlock_irq(&rtc_lock); + spin_unlock_irqrestore(&rtc_lock, flags); return retval; } - -int timer_ack; - -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. - */ -unsigned long long monotonic_clock(void) -{ - return cur_timer->monotonic_clock(); -} -EXPORT_SYMBOL(monotonic_clock); - #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) unsigned long profile_pc(struct pt_regs *regs) { @@ -245,70 +143,6 @@ unsigned long profile_pc(struct pt_regs EXPORT_SYMBOL(profile_pc); #endif -/* - * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick - */ -static inline void do_timer_interrupt(int irq, struct pt_regs *regs) -{ -#ifdef CONFIG_X86_IO_APIC - if (timer_ack) { - /* - * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to reset the IRR bit for do_slow_gettimeoffset(). 
- * This will also deassert NMI lines for the watchdog if run - * on an 82489DX-based system. - */ - spin_lock(&i8259A_lock); - outb(0x0c, PIC_MASTER_OCW3); - /* Ack the IRQ; AEOI will end it automatically. */ - inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); - } -#endif - - do_timer_interrupt_hook(regs); - - - if (MCA_bus) { - /* The PS/2 uses level-triggered interrupts. You can't - turn them off, nor would you want to (any attempt to - enable edge-triggered interrupts usually gets intercepted by a - special hardware circuit). Hence we have to acknowledge - the timer interrupt. Through some incredibly stupid - design idea, the reset for IRQ 0 is done by setting the - high bit of the PPI port B (0x61). Note that some PS/2s, - notably the 55SX, work fine if this is removed. */ - - irq = inb_p( 0x61 ); /* read the current state */ - outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ - } -} - -/* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. - */ -irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don' t need - * the irq version of write_lock because as just said we have irq - * locally disabled. -arca - */ - write_seqlock(&xtime_lock); - - cur_timer->mark_offset(); - - do_timer_interrupt(irq, regs); - - write_sequnlock(&xtime_lock); - return IRQ_HANDLED; -} - /* not static: needed by APM */ unsigned long get_cmos_time(void) { @@ -327,139 +161,42 @@ unsigned long get_cmos_time(void) } EXPORT_SYMBOL(get_cmos_time); -static void sync_cmos_clock(unsigned long dummy); - -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); - -static void sync_cmos_clock(unsigned long dummy) +/* arch specific timeofday hooks */ +nsec_t read_persistent_clock(void) { - struct timeval now, next; - int fail = 1; + return (nsec_t)get_cmos_time() * NSEC_PER_SEC; +} +void sync_persistent_clock(struct timespec ts) +{ + static unsigned long last_rtc_update; /* * If we have an externally synchronized Linux clock, then update * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... */ - if (!ntp_synced()) - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). 
- */ + if (ts.tv_sec <= last_rtc_update + 660) return; - do_gettimeofday(&now); - if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && - now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) - fail = set_rtc_mmss(now.tv_sec); - - next.tv_usec = USEC_AFTER - now.tv_usec; - if (next.tv_usec <= 0) - next.tv_usec += USEC_PER_SEC; - - if (!fail) - next.tv_sec = 659; - else - next.tv_sec = 0; - - if (next.tv_usec >= USEC_PER_SEC) { - next.tv_sec++; - next.tv_usec -= USEC_PER_SEC; + if((ts.tv_nsec / 1000) >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && + (ts.tv_nsec / 1000) <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) { + /* horrible...FIXME */ + if (set_rtc_mmss(ts.tv_sec) == 0) + last_rtc_update = ts.tv_sec; + else + last_rtc_update = ts.tv_sec - 600; /* do it again in 60 s */ } - mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); -} - -void notify_arch_cmos_timer(void) -{ - mod_timer(&sync_cmos_timer, jiffies + 1); } -static long clock_cmos_diff, sleep_start; - -static struct timer_opts *last_timer; -static int timer_suspend(struct sys_device *dev, pm_message_t state) -{ - /* - * Estimate time zone so that set_time can update the clock - */ - clock_cmos_diff = -get_cmos_time(); - clock_cmos_diff += get_seconds(); - sleep_start = get_cmos_time(); - last_timer = cur_timer; - cur_timer = &timer_none; - if (last_timer->suspend) - last_timer->suspend(state); - return 0; -} - -static int timer_resume(struct sys_device *dev) -{ - unsigned long flags; - unsigned long sec; - unsigned long sleep_length; - -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) - hpet_reenable(); -#endif - setup_pit_timer(); - sec = get_cmos_time() + clock_cmos_diff; - sleep_length = (get_cmos_time() - sleep_start) * HZ; - write_seqlock_irqsave(&xtime_lock, flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); - jiffies += sleep_length; - wall_jiffies += sleep_length; - if (last_timer->resume) - last_timer->resume(); - cur_timer = last_timer; - last_timer = NULL; - touch_softlockup_watchdog(); - return 0; -} - -static struct sysdev_class timer_sysclass = { - .resume = timer_resume, - .suspend = timer_suspend, - set_kset_name("timer"), -}; - - -/* XXX this driverfs stuff should probably go elsewhere later -john */ -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int time_init_device(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(time_init_device); - #ifdef CONFIG_HPET_TIMER extern void (*late_time_init)(void); /* Duplicate of time_init() below, with hpet_enable part added */ static void __init hpet_time_init(void) { - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - if ((hpet_enable() >= 0) && hpet_use_timer) { printk("Using HPET for base-timer\n"); } - cur_timer = select_timer(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); time_init_hook(); } @@ -467,6 +204,9 @@ static void __init hpet_time_init(void) void __init time_init(void) { + /* Set the clock to HZ Hz: */ + setup_pit_timer(); + #ifdef CONFIG_HPET_TIMER if (is_hpet_capable()) { /* @@ -477,13 +217,5 @@ void __init time_init(void) return; } #endif - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - 
-xtime.tv_sec, -xtime.tv_nsec); - - cur_timer = select_timer(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); - time_init_hook(); } Index: linux-2.6.14/arch/ia64/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/ia64/kernel/time.c +++ linux-2.6.14/arch/ia64/kernel/time.c @@ -32,10 +32,6 @@ extern unsigned long wall_jiffies; -u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - #define TIME_KEEPER_ID 0 /* smp_processor_id() of time-keeper */ #ifdef CONFIG_IA64_DEBUG_IRQ Index: linux-2.6.14/arch/m32r/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/m32r/kernel/time.c +++ linux-2.6.14/arch/m32r/kernel/time.c @@ -39,10 +39,6 @@ extern void send_IPI_allbutself(int, int extern void smp_local_timer_interrupt(struct pt_regs *); #endif -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - extern unsigned long wall_jiffies; #define TICK_SIZE (tick_nsec / 1000) Index: linux-2.6.14/arch/m68k/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/m68k/kernel/time.c +++ linux-2.6.14/arch/m68k/kernel/time.c @@ -27,10 +27,6 @@ #include #include -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - static inline int set_rtc_mmss(unsigned long nowtime) { if (mach_set_clock_mmss) Index: linux-2.6.14/arch/m68knommu/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/m68knommu/kernel/time.c +++ linux-2.6.14/arch/m68knommu/kernel/time.c @@ -27,10 +27,6 @@ #define TICK_SIZE (tick_nsec / 1000) -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - extern unsigned long wall_jiffies; Index: linux-2.6.14/arch/mips/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/mips/kernel/time.c +++ linux-2.6.14/arch/mips/kernel/time.c @@ -43,10 +43,6 @@ #define TICK_SIZE (tick_nsec / 1000) -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - /* * forward reference */ Index: linux-2.6.14/arch/parisc/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/parisc/kernel/time.c +++ linux-2.6.14/arch/parisc/kernel/time.c @@ -33,10 +33,6 @@ #include -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - /* xtime and wall_jiffies keep wall-clock time */ extern unsigned long wall_jiffies; Index: linux-2.6.14/arch/ppc/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/ppc/kernel/time.c +++ linux-2.6.14/arch/ppc/kernel/time.c @@ -66,11 +66,6 @@ #include -/* XXX false sharing with below? 
*/ -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - unsigned long disarm_decr[NR_CPUS]; extern struct timezone sys_tz; Index: linux-2.6.14/arch/ppc64/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/ppc64/kernel/time.c +++ linux-2.6.14/arch/ppc64/kernel/time.c @@ -68,10 +68,6 @@ #include #include -u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - /* keep track of when we need to update the rtc */ time_t last_rtc_update; extern int piranha_simulator; Index: linux-2.6.14/arch/s390/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/s390/kernel/time.c +++ linux-2.6.14/arch/s390/kernel/time.c @@ -49,10 +49,6 @@ #define TICK_SIZE tick -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - static ext_int_info_t ext_int_info_cc; static u64 init_timer_cc; static u64 jiffies_timer_cc; Index: linux-2.6.14/arch/sh/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/sh/kernel/time.c +++ linux-2.6.14/arch/sh/kernel/time.c @@ -56,10 +56,6 @@ extern unsigned long wall_jiffies; #define TICK_SIZE (tick_nsec / 1000) DEFINE_SPINLOCK(tmu0_lock); -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - /* XXX: Can we initialize this in a routine somewhere? Dreamcast doesn't want * these routines anywhere... */ #ifdef CONFIG_SH_RTC Index: linux-2.6.14/arch/sh64/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/sh64/kernel/time.c +++ linux-2.6.14/arch/sh64/kernel/time.c @@ -116,8 +116,6 @@ extern unsigned long wall_jiffies; -u64 jiffies_64 = INITIAL_JIFFIES; - static unsigned long tmu_base, rtc_base; unsigned long cprc_base; Index: linux-2.6.14/arch/sparc/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/sparc/kernel/time.c +++ linux-2.6.14/arch/sparc/kernel/time.c @@ -45,10 +45,6 @@ extern unsigned long wall_jiffies; -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - DEFINE_SPINLOCK(rtc_lock); enum sparc_clock_type sp_clock_typ; DEFINE_SPINLOCK(mostek_lock); Index: linux-2.6.14/arch/sparc64/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/sparc64/kernel/time.c +++ linux-2.6.14/arch/sparc64/kernel/time.c @@ -55,10 +55,6 @@ unsigned long ds1287_regs = 0UL; extern unsigned long wall_jiffies; -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - static void __iomem *mstk48t08_regs; static void __iomem *mstk48t59_regs; Index: linux-2.6.14/arch/um/kernel/time_kern.c =================================================================== --- linux-2.6.14.orig/arch/um/kernel/time_kern.c +++ linux-2.6.14/arch/um/kernel/time_kern.c @@ -22,10 +22,6 @@ #include "mode.h" #include "os.h" -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - int hz(void) { return(HZ); Index: linux-2.6.14/arch/v850/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/v850/kernel/time.c +++ linux-2.6.14/arch/v850/kernel/time.c @@ -26,10 +26,6 @@ #include "mach.h" -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - #define TICK_SIZE (tick_nsec / 1000) /* Index: linux-2.6.14/arch/x86_64/kernel/time.c =================================================================== --- 
linux-2.6.14.orig/arch/x86_64/kernel/time.c +++ linux-2.6.14/arch/x86_64/kernel/time.c @@ -38,17 +38,11 @@ #include #include #include +#include #ifdef CONFIG_X86_LOCAL_APIC #include #endif -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - -#ifdef CONFIG_CPU_FREQ -static void cpufreq_delayed_get(void); -#endif extern void i8254_timer_resume(void); extern int using_apic_timer; @@ -60,7 +54,9 @@ static int notsc __initdata = 0; #undef HPET_HACK_ENABLE_DANGEROUS -unsigned int cpu_khz; /* TSC clocks / usec, not used here */ +unsigned int cpu_khz; /* CPU clocks / usec, not used here */ +unsigned int tsc_khz; /* TSC clocks / usec, not used here */ +unsigned long hpet_address; static unsigned long hpet_period; /* fsecs / HPET clock */ unsigned long hpet_tick; /* HPET clocks / interrupt */ static int hpet_use_timer; @@ -83,107 +79,6 @@ static inline void rdtscll_sync(unsigned rdtscll(*tsc); } -/* - * do_gettimeoffset() returns microseconds since last timer interrupt was - * triggered by hardware. A memory read of HPET is slower than a register read - * of TSC, but much more reliable. It's also synchronized to the timer - * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a - * timer interrupt has happened already, but vxtime.trigger wasn't updated yet. - * This is not a problem, because jiffies hasn't updated either. They are bound - * together by xtime_lock. - */ - -static inline unsigned int do_gettimeoffset_tsc(void) -{ - unsigned long t; - unsigned long x; - rdtscll_sync(&t); - if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */ - x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32; - return x; -} - -static inline unsigned int do_gettimeoffset_hpet(void) -{ - /* cap counter read to one tick to avoid inconsistencies */ - unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; - return (min(counter,hpet_tick) * vxtime.quot) >> 32; -} - -unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; - -/* - * This version of gettimeofday() has microsecond resolution and better than - * microsecond precision, as we're using at least a 10 MHz (usually 14.31818 - * MHz) HPET timer. - */ - -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq, t; - unsigned int sec, usec; - - do { - seq = read_seqbegin(&xtime_lock); - - sec = xtime.tv_sec; - usec = xtime.tv_nsec / 1000; - - /* i386 does some correction here to keep the clock - monotonous even when ntpd is fixing drift. - But they didn't work for me, there is a non monotonic - clock anyways with ntp. - I dropped all corrections now until a real solution can - be found. Note when you fix it here you need to do the same - in arch/x86_64/kernel/vsyscall.c and export all needed - variables in vmlinux.lds. -AK */ - - t = (jiffies - wall_jiffies) * (1000000L / HZ) + - do_gettimeoffset(); - usec += t; - - } while (read_seqretry(&xtime_lock, seq)); - - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; -} - -EXPORT_SYMBOL(do_gettimeofday); - -/* - * settimeofday() first undoes the correction that gettimeofday would do - * on the time, and then saves it. This is ugly, but has been like this for - * ages already. 
- */ - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - - nsec -= do_gettimeoffset() * 1000 + - (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -283,90 +178,8 @@ static void set_rtc_mmss(unsigned long n spin_unlock(&rtc_lock); } - -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. - */ -unsigned long long monotonic_clock(void) -{ - unsigned long seq; - u32 last_offset, this_offset, offset; - unsigned long long base; - - if (vxtime.mode == VXTIME_HPET) { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last; - base = monotonic_base; - this_offset = hpet_readl(HPET_COUNTER); - - } while (read_seqretry(&xtime_lock, seq)); - offset = (this_offset - last_offset); - offset *=(NSEC_PER_SEC/HZ)/hpet_tick; - return base + offset; - }else{ - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last_tsc; - base = monotonic_base; - } while (read_seqretry(&xtime_lock, seq)); - sync_core(); - rdtscll(this_offset); - offset = (this_offset - last_offset)*1000/cpu_khz; - return base + offset; - } - - -} -EXPORT_SYMBOL(monotonic_clock); - -static noinline void handle_lost_ticks(int lost, struct pt_regs *regs) -{ - static long lost_count; - static int warned; - - if (report_lost_ticks) { - printk(KERN_WARNING "time.c: Lost %d timer " - "tick(s)! ", lost); - print_symbol("rip %s)\n", regs->rip); - } - - if (lost_count == 1000 && !warned) { - printk(KERN_WARNING - "warning: many lost ticks.\n" - KERN_WARNING "Your time source seems to be instable or " - "some driver is hogging interupts\n"); - print_symbol("rip %s\n", regs->rip); - if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) { - printk(KERN_WARNING "Falling back to HPET\n"); - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; - } - /* else should fall back to PIT, but code missing. */ - warned = 1; - } else - lost_count++; - -#ifdef CONFIG_CPU_FREQ - /* In some cases the CPU can change frequency without us noticing - (like going into thermal throttle) - Give cpufreq a change to catch up. */ - if ((lost_count+1) % 25 == 0) { - cpufreq_delayed_get(); - } -#endif -} - static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { - static unsigned long rtc_update = 0; - unsigned long tsc; - int delay, offset = 0, lost = 0; - /* * Here we are in the timer irq handler. 
We have irqs locally disabled (so we * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running @@ -376,67 +189,6 @@ static irqreturn_t timer_interrupt(int i write_seqlock(&xtime_lock); - if (vxtime.hpet_address) - offset = hpet_readl(HPET_COUNTER); - - if (hpet_use_timer) { - /* if we're using the hpet timer functionality, - * we can more accurately know the counter value - * when the timer interrupt occured. - */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - delay = hpet_readl(HPET_COUNTER) - offset; - } else { - spin_lock(&i8253_lock); - outb_p(0x00, 0x43); - delay = inb_p(0x40); - delay |= inb(0x40) << 8; - spin_unlock(&i8253_lock); - delay = LATCH - 1 - delay; - } - - rdtscll_sync(&tsc); - - if (vxtime.mode == VXTIME_HPET) { - if (offset - vxtime.last > hpet_tick) { - lost = (offset - vxtime.last) / hpet_tick - 1; - } - - monotonic_base += - (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; - - vxtime.last = offset; -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - lost = pmtimer_mark_offset(); -#endif - } else { - offset = (((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); - - if (offset < 0) - offset = 0; - - if (offset > (USEC_PER_SEC / HZ)) { - lost = offset / (USEC_PER_SEC / HZ); - offset %= (USEC_PER_SEC / HZ); - } - - monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ; - - vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; - - if ((((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) < offset) - vxtime.last_tsc = tsc - - (((long) offset << 32) / vxtime.tsc_quot) - 1; - } - - if (lost > 0) { - handle_lost_ticks(lost, regs); - jiffies += lost; - } - /* * Do the timer stuff. */ @@ -459,20 +211,6 @@ static irqreturn_t timer_interrupt(int i smp_local_timer_interrupt(regs); #endif -/* - * If we have an externally synchronized Linux clock, then update CMOS clock - * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy - * closest to exactly 500 ms before the next second. If the update fails, we - * don't care, as it'll be updated on the next turn, and the problem (time way - * off) isn't likely to go away much sooner anyway. - */ - - if (ntp_synced() && xtime.tv_sec > rtc_update && - abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) { - set_rtc_mmss(xtime.tv_sec); - rtc_update = xtime.tv_sec + 660; - } - write_sequnlock(&xtime_lock); return IRQ_HANDLED; @@ -573,6 +311,30 @@ unsigned long get_cmos_time(void) return mktime(year, mon, day, hour, min, sec); } +/* arch specific timeofday hooks */ +u64 read_persistent_clock(void) +{ + return (u64)get_cmos_time() * NSEC_PER_SEC; +} + +void sync_persistent_clock(struct timespec ts) +{ + static unsigned long rtc_update = 0; + /* + * If we have an externally synchronized Linux clock, then update + * CMOS clock accordingly every ~11 minutes. set_rtc_mmss() will + * be called in the jiffy closest to exactly 500 ms before the + * next second. If the update fails, we don't care, as it'll be + * updated on the next turn, and the problem (time way off) isn't + * likely to go away much sooner anyway. + */ + if (ts.tv_sec > rtc_update && + abs(ts.tv_nsec - 500000000) <= tick_nsec / 2) { + set_rtc_mmss(xtime.tv_sec); + rtc_update = xtime.tv_sec + 660; + } +} + #ifdef CONFIG_CPU_FREQ /* Frequency scaling support. 
Adjust the TSC based timer when the cpu frequency @@ -600,23 +362,6 @@ static void handle_cpufreq_delayed_get(v cpufreq_delayed_issched = 0; } -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static void cpufreq_delayed_get(void) -{ - static int warned; - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - if (!warned) { - warned = 1; - printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n"); - } - schedule_work(&cpufreq_delayed_get_work); - } -} - static unsigned int ref_freq = 0; static unsigned long loops_per_jiffy_ref = 0; @@ -651,8 +396,11 @@ static int time_cpufreq_notifier(struct cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { vxtime.tsc_quot = (1000L << 32) / cpu_khz; + tsc_khz = cpu_khz; + } + } set_cyc2ns_scale(cpu_khz_ref / 1000); @@ -916,18 +664,12 @@ void __init time_init(void) if (hpet_use_timer) { cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; -#ifdef CONFIG_X86_PM_TIMER - } else if (pmtmr_ioport) { - vxtime_hz = PM_TIMER_FREQUENCY; - timename = "PM"; - pit_init(); - cpu_khz = pit_calibrate_tsc(); -#endif } else { pit_init(); cpu_khz = pit_calibrate_tsc(); timename = "PIT"; } + tsc_khz = cpu_khz; printk(KERN_INFO "time.c: Using %ld.%06ld MHz %s timer.\n", vxtime_hz / 1000000, vxtime_hz % 1000000, timename); @@ -969,31 +711,8 @@ static __init int unsynchronized_tsc(voi */ void __init time_init_gtod(void) { - char *timetype; - if (unsynchronized_tsc()) notsc = 1; - if (vxtime.hpet_address && notsc) { - timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; -#ifdef CONFIG_X86_PM_TIMER - /* Using PM for gettimeofday is quite slow, but we have no other - choice because the TSC is too unreliable on some systems. */ - } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) { - timetype = "PM"; - do_gettimeoffset = do_gettimeoffset_pm; - vxtime.mode = VXTIME_PMTMR; - sysctl_vsyscall = 0; - printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); -#endif - } else { - timetype = hpet_use_timer ? 
"HPET/TSC" : "PIT/TSC"; - vxtime.mode = VXTIME_TSC; - } - - printk(KERN_INFO "time.c: Using %s based timekeeping.\n", timetype); } __setup("report_lost_ticks", time_setup); @@ -1016,7 +735,6 @@ static int timer_suspend(struct sys_devi static int timer_resume(struct sys_device *dev) { - unsigned long flags; unsigned long sec; unsigned long ctime = get_cmos_time(); unsigned long sleep_length = (ctime - sleep_start) * HZ; @@ -1027,10 +745,6 @@ static int timer_resume(struct sys_devic i8254_timer_resume(); sec = ctime + clock_cmos_diff; - write_seqlock_irqsave(&xtime_lock,flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - write_sequnlock_irqrestore(&xtime_lock,flags); jiffies += sleep_length; wall_jiffies += sleep_length; touch_softlockup_watchdog(); @@ -1305,3 +1019,93 @@ static int __init notsc_setup(char *s) __setup("notsc", notsc_setup); +/* Code to compensate for TSC C3 stalls */ +static u64 tsc_c3_offset; +static int tsc_unstable; +static inline int check_tsc_unstable(void) +{ + return tsc_unstable; +} +static inline void mark_tsc_unstable(void) +{ + tsc_unstable = 1;; +} + +void tsc_c3_compensate(unsigned long nsecs) +{ + u64 cycles = ((u64)nsecs * tsc_khz)/1000000; + tsc_c3_offset += cycles; +} + +static inline u64 tsc_read_c3_time(void) +{ + return tsc_c3_offset; +} + + +/* Clock source code */ +#include + +static unsigned long current_tsc_khz = 0; +static cycle_t read_tsc_c3(void); +static int tsc_update_callback(void); + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .type = CLOCKSOURCE_CYCLES, + .mask = (cycle_t)-1, + .mult = 0, /* to be set */ + .shift = 22, + .update_callback = tsc_update_callback, + .is_continuous = 1, +}; + +static cycle_t read_tsc_c3(void) +{ + cycle_t ret; + rdtscll(ret); + return ret + tsc_read_c3_time(); +} + +static int tsc_update_callback(void) +{ + int change = 0; + /* check to see if we should switch to the safe clocksource */ + if (tsc_read_c3_time() && + strncmp(clocksource_tsc.name, "c3tsc", 5)) { + printk("Falling back to C3 safe TSC\n"); + clocksource_tsc.read_fnct = read_tsc_c3; + clocksource_tsc.type = CLOCKSOURCE_FUNCTION; + clocksource_tsc.name = "c3tsc"; + change = 1; + } + + if (clocksource_tsc.rating != 50 && check_tsc_unstable()) { + clocksource_tsc.rating = 50; + reselect_clocksource(); + change = 1; + } + /* only update if tsc_khz has changed */ + if (current_tsc_khz != tsc_khz){ + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + change = 1; + } + return change; +} + +static int __init init_tsc_clocksource(void) +{ + if (!notsc && tsc_khz) { + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + register_clocksource(&clocksource_tsc); + } + return 0; +} + +module_init(init_tsc_clocksource); + Index: linux-2.6.14/arch/xtensa/kernel/time.c =================================================================== --- linux-2.6.14.orig/arch/xtensa/kernel/time.c +++ linux-2.6.14/arch/xtensa/kernel/time.c @@ -29,9 +29,6 @@ extern volatile unsigned long wall_jiffies; -u64 jiffies_64 = INITIAL_JIFFIES; -EXPORT_SYMBOL(jiffies_64); - spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED; EXPORT_SYMBOL(rtc_lock); Index: linux-2.6.14/kernel/timer.c =================================================================== --- linux-2.6.14.orig/kernel/timer.c +++ linux-2.6.14/kernel/timer.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include @@ -46,6 +46,10 @@ static 
void time_interpolator_update(lon #define time_interpolator_update(x) #endif +u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; + +EXPORT_SYMBOL(jiffies_64); + /* * per-CPU timer vector definitions: */ @@ -613,13 +617,94 @@ long time_tolerance = MAXFREQ; /* frequ long time_precision = 1; /* clock precision (us) */ long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -static long time_phase; /* phase offset (scaled us) */ long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; /* frequency offset (scaled ppm)*/ static long time_adj; /* tick adjust (scaled 1 / HZ) */ long time_reftime; /* time at last adjustment (s) */ long time_adjust; long time_next_adjust; +long time_adjust_step; /* per tick time_adjust step */ + +long total_sppm; /* shifted ppm sum of all NTP adjustments */ +long offset_adj_ppm; +long tick_adj_ppm; +long singleshot_adj_ppm; + +#define MAX_SINGLESHOT_ADJ 500 /* (ppm) */ +#define SEC_PER_DAY 86400 +#define END_OF_DAY(x) (x + SEC_PER_DAY - (x % SEC_PER_DAY) - 1) + +/* NTP lock, protects NTP state machine */ +seqlock_t ntp_lock = SEQLOCK_UNLOCKED; + +/** + * ntp_leapsecond - NTP leapsecond processing code. + * now: the current time + * + * Returns the number of seconds (-1, 0, or 1) that + * should be added to the current time to properly + * adjust for leapseconds. + */ + +int ntp_leapsecond(struct timespec now) +{ + unsigned long flags; + /* + * Leap second processing. If in leap-insert state at + * the end of the day, the system clock is set back one + * second; if in leap-delete state, the system clock is + * set ahead one second. + */ + static time_t leaptime = 0; + int ret = 0; + + write_seqlock_irqsave(&ntp_lock, flags); + switch (time_state) { + + case TIME_OK: + if (time_status & STA_INS) { + time_state = TIME_INS; + leaptime = END_OF_DAY(now.tv_sec); + } else if (time_status & STA_DEL) { + time_state = TIME_DEL; + leaptime = END_OF_DAY(now.tv_sec); + } + break; + + case TIME_INS: + /* Once we are at (or past) leaptime, insert the second */ + if (now.tv_sec >= leaptime) { + time_state = TIME_OOP; + printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); + ret = -1; + } + break; + + case TIME_DEL: + /* Once we are at (or past) leaptime, delete the second */ + if (now.tv_sec >= leaptime) { + time_state = TIME_WAIT; + printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); + ret = 1; + } + break; + + case TIME_OOP: + /* Wait for the end of the leap second*/ + if (now.tv_sec > (leaptime + 1)) + time_state = TIME_WAIT; + time_state = TIME_WAIT; + break; + + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + break; + } + + write_sequnlock_irqrestore(&ntp_lock, flags); + return 0; +} /* * this routine handles the overflow of the microsecond field @@ -642,59 +727,6 @@ static void second_overflow(void) } /* - * Leap second processing. If in leap-insert state at - * the end of the day, the system clock is set back one - * second; if in leap-delete state, the system clock is - * set ahead one second. The microtime() routine or - * external clock driver will insure that reported time - * is always monotonic. The ugly divides should be - * replaced. 
- */ - switch (time_state) { - - case TIME_OK: - if (time_status & STA_INS) - time_state = TIME_INS; - else if (time_status & STA_DEL) - time_state = TIME_DEL; - break; - - case TIME_INS: - if (xtime.tv_sec % 86400 == 0) { - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; - /* The timer interpolator will make time change gradually instead - * of an immediate jump by one second. - */ - time_interpolator_update(-NSEC_PER_SEC); - time_state = TIME_OOP; - clock_was_set(); - printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); - } - break; - - case TIME_DEL: - if ((xtime.tv_sec + 1) % 86400 == 0) { - xtime.tv_sec++; - wall_to_monotonic.tv_sec--; - /* Use of time interpolator for a gradual change of time */ - time_interpolator_update(NSEC_PER_SEC); - time_state = TIME_WAIT; - clock_was_set(); - printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); - } - break; - - case TIME_OOP: - time_state = TIME_WAIT; - break; - - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - } - - /* * Compute the phase adjustment for the next second. In * PLL mode, the offset is reduced by a fixed factor * times the time constant. In FLL mode the offset is @@ -703,23 +735,20 @@ static void second_overflow(void) * the adjustment over not more than the number of * seconds between updates. */ - if (time_offset < 0) { - ltemp = -time_offset; - if (!(time_status & STA_FLL)) - ltemp >>= SHIFT_KG + time_constant; - if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) - ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; - time_offset += ltemp; - time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - } else { ltemp = time_offset; if (!(time_status & STA_FLL)) - ltemp >>= SHIFT_KG + time_constant; - if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) - ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + ltemp = shift_right(ltemp, SHIFT_KG + time_constant); + ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); + ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); time_offset -= ltemp; time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - } + + offset_adj_ppm = shift_right(ltemp, SHIFT_UPDATE); /* ppm */ + + /* first calculate usec/user_tick offset */ + tick_adj_ppm = ((USEC_PER_SEC + USER_HZ/2)/USER_HZ) - tick_usec; + /* multiply by user_hz to get usec/sec => ppm */ + tick_adj_ppm *= USER_HZ; /* * Compute the frequency estimate and additional phase @@ -736,82 +765,90 @@ static void second_overflow(void) STA_PPSWANDER | STA_PPSERROR); } ltemp = time_freq + pps_freq; - if (ltemp < 0) - time_adj -= -ltemp >> - (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); - else - time_adj += ltemp >> - (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); #if HZ == 100 /* Compensate for (HZ==100) != (1 << SHIFT_HZ). * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) */ - if (time_adj < 0) - time_adj -= (-time_adj >> 2) + (-time_adj >> 5); - else - time_adj += (time_adj >> 2) + (time_adj >> 5); + time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); #endif #if HZ == 1000 /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 
14) */ - if (time_adj < 0) - time_adj -= (-time_adj >> 6) + (-time_adj >> 7); - else - time_adj += (time_adj >> 6) + (time_adj >> 7); + time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); #endif } -/* in the NTP reference this is called "hardclock()" */ -static void update_wall_time_one_tick(void) + +/** + * ntp_get_ppm_adjustment - Returns Shifted PPM adjustment + * + */ +long ntp_get_ppm_adjustment(void) +{ + return total_sppm; +} + +/** + * ntp_advance() - increments the NTP state machine + * + */ +void ntp_advance(unsigned long interval_ns) { - long time_adjust_step, delta_nsec; + static unsigned long interval_sum; + unsigned long flags; + write_seqlock_irqsave(&ntp_lock, flags); - if ( (time_adjust_step = time_adjust) != 0 ) { - /* We are doing an adjtime thing. - * - * Prepare time_adjust_step to be within bounds. - * Note that a positive time_adjust means we want the clock - * to run faster. - * - * Limit the amount of the step to be in the range - * -tickadj .. +tickadj - */ - if (time_adjust > tickadj) - time_adjust_step = tickadj; - else if (time_adjust < -tickadj) - time_adjust_step = -tickadj; + /* increment the interval sum */ + interval_sum += interval_ns; - /* Reduce by this step the amount of time left */ - time_adjust -= time_adjust_step; - } - delta_nsec = tick_nsec + time_adjust_step * 1000; - /* - * Advance the phase, once it gets to one microsecond, then - * advance the tick more. - */ - time_phase += time_adj; - if (time_phase <= -FINENSEC) { - long ltemp = -time_phase >> (SHIFT_SCALE - 10); - time_phase += ltemp << (SHIFT_SCALE - 10); - delta_nsec -= ltemp; - } - else if (time_phase >= FINENSEC) { - long ltemp = time_phase >> (SHIFT_SCALE - 10); - time_phase -= ltemp << (SHIFT_SCALE - 10); - delta_nsec += ltemp; + /* calculate the per tick singleshot adjtime adjustment step */ + while (interval_ns >= tick_nsec) { + time_adjust_step = time_adjust; + if (time_adjust_step) { + /* We are doing an adjtime thing. + * + * Prepare time_adjust_step to be within bounds. + * Note that a positive time_adjust means we want the clock + * to run faster. + * + * Limit the amount of the step to be in the range + * -tickadj .. +tickadj + */ + time_adjust_step = min(time_adjust_step, (long)tickadj); + time_adjust_step = max(time_adjust_step, (long)-tickadj); + + /* Reduce by this step the amount of time left */ + time_adjust -= time_adjust_step; + } + interval_ns -= tick_nsec; } - xtime.tv_nsec += delta_nsec; - time_interpolator_update(delta_nsec); + singleshot_adj_ppm = time_adjust_step*(1000000/HZ); /* usec/tick => ppm */ /* Changes by adjtime() do not take effect till next tick. 
*/ if (time_next_adjust != 0) { time_adjust = time_next_adjust; time_next_adjust = 0; } + + while (interval_sum >= NSEC_PER_SEC) { + interval_sum -= NSEC_PER_SEC; + second_overflow(); + } + + /* calculate the total continuous ppm adjustment */ + total_sppm = time_freq; /* already shifted by SHIFT_USEC */ + total_sppm += offset_adj_ppm << SHIFT_USEC; + total_sppm += tick_adj_ppm << SHIFT_USEC; + total_sppm += singleshot_adj_ppm << SHIFT_USEC; + + write_sequnlock_irqrestore(&ntp_lock, flags); + } +#ifndef CONFIG_GENERIC_TIME /* * Using a loop looks inefficient, but "ticks" is * usually just one (we shouldn't be losing ticks, @@ -821,16 +858,51 @@ static void update_wall_time_one_tick(vo */ static void update_wall_time(unsigned long ticks) { + long delta_nsec; + static long time_phase; /* phase offset (scaled us) */ + do { ticks--; - update_wall_time_one_tick(); - if (xtime.tv_nsec >= 1000000000) { - xtime.tv_nsec -= 1000000000; + + /* Calculate the nsec delta using the + * precomputed NTP adjustments: + * tick_nsec, time_adjust_step, time_adj + */ + delta_nsec = tick_nsec + time_adjust_step * 1000; + /* + * Advance the phase, once it gets to one microsecond, then + * advance the tick more. + */ + time_phase += time_adj; + if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { + long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); + time_phase -= ltemp << (SHIFT_SCALE - 10); + delta_nsec += ltemp; + } + + xtime.tv_nsec += delta_nsec; + if (xtime.tv_nsec >= NSEC_PER_SEC) { + int leapsecond; + xtime.tv_nsec -= NSEC_PER_SEC; xtime.tv_sec++; - second_overflow(); + /* process leapsecond */ + leapsecond = ntp_leapsecond(xtime); + if (leapsecond) { + xtime.tv_sec += leapsecond; + wall_to_monotonic.tv_sec -= leapsecond; + /* Use of time interpolator for a gradual change of time */ + time_interpolator_update(leapsecond*NSEC_PER_SEC); + clock_was_set(); + } } + ntp_advance(tick_nsec); + time_interpolator_update(delta_nsec); + } while (ticks); } +#else /* !CONFIG_GENERIC_TIME */ +#define update_wall_time(x) +#endif /* !CONFIG_GENERIC_TIME */ /* * Called from the timer interrupt handler to charge one tick to the current @@ -912,6 +984,7 @@ static void run_timer_softirq(struct sof { tvec_base_t *base = &__get_cpu_var(tvec_bases); + ktimer_run_queues(); if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); } @@ -1177,62 +1250,6 @@ asmlinkage long sys_gettid(void) return current->pid; } -static long __sched nanosleep_restart(struct restart_block *restart) -{ - unsigned long expire = restart->arg0, now = jiffies; - struct timespec __user *rmtp = (struct timespec __user *) restart->arg1; - long ret; - - /* Did it expire while we handled signals? 
*/ - if (!time_after(expire, now)) - return 0; - - expire = schedule_timeout_interruptible(expire - now); - - ret = 0; - if (expire) { - struct timespec t; - jiffies_to_timespec(expire, &t); - - ret = -ERESTART_RESTARTBLOCK; - if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) - ret = -EFAULT; - /* The 'restart' block is already filled in */ - } - return ret; -} - -asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) -{ - struct timespec t; - unsigned long expire; - long ret; - - if (copy_from_user(&t, rqtp, sizeof(t))) - return -EFAULT; - - if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0)) - return -EINVAL; - - expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); - expire = schedule_timeout_interruptible(expire); - - ret = 0; - if (expire) { - struct restart_block *restart; - jiffies_to_timespec(expire, &t); - if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) - return -EFAULT; - - restart = ¤t_thread_info()->restart_block; - restart->fn = nanosleep_restart; - restart->arg0 = jiffies + expire; - restart->arg1 = (unsigned long) rmtp; - ret = -ERESTART_RESTARTBLOCK; - } - return ret; -} - /* * sys_sysinfo - fill in sysinfo struct */ Index: linux-2.6.14/fs/exec.c =================================================================== --- linux-2.6.14.orig/fs/exec.c +++ linux-2.6.14/fs/exec.c @@ -645,9 +645,12 @@ static inline int de_thread(struct task_ * synchronize with any firing (by calling del_timer_sync) * before we can safely let the old group leader die. */ - sig->real_timer.data = (unsigned long)current; - if (del_timer_sync(&sig->real_timer)) - add_timer(&sig->real_timer); + sig->real_timer.data = current; + spin_unlock_irq(lock); + if (ktimer_cancel(&sig->real_timer)) + ktimer_start(&sig->real_timer, NULL, + KTIMER_RESTART|KTIMER_NOCHECK); + spin_lock_irq(lock); } while (atomic_read(&sig->count) > count) { sig->group_exit_task = current; @@ -659,7 +662,6 @@ static inline int de_thread(struct task_ } sig->group_exit_task = NULL; sig->notify_count = 0; - sig->real_timer.data = (unsigned long)current; spin_unlock_irq(lock); /* Index: linux-2.6.14/kernel/posix-timers.c =================================================================== --- linux-2.6.14.orig/kernel/posix-timers.c +++ linux-2.6.14/kernel/posix-timers.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include @@ -48,21 +48,6 @@ #include #include -#ifndef div_long_long_rem -#include - -#define div_long_long_rem(dividend,divisor,remainder) ({ \ - u64 result = dividend; \ - *remainder = do_div(result,divisor); \ - result; }) - -#endif -#define CLOCK_REALTIME_RES TICK_NSEC /* In nano seconds. */ - -static inline u64 mpy_l_X_l_ll(unsigned long mpy1,unsigned long mpy2) -{ - return (u64)mpy1 * mpy2; -} /* * Management arrays for POSIX timers. Timers are kept in slab memory * Timer ids are allocated by an external routine that keeps track of the @@ -148,18 +133,18 @@ static DEFINE_SPINLOCK(idr_lock); */ static struct k_clock posix_clocks[MAX_CLOCKS]; + /* - * We only have one real clock that can be set so we need only one abs list, - * even if we should want to have several clocks with differing resolutions. + * These ones are defined below. 
*/ -static struct k_clock_abs abs_list = {.list = LIST_HEAD_INIT(abs_list.list), - .lock = SPIN_LOCK_UNLOCKED}; +static int common_nsleep(clockid_t, int flags, struct timespec *t, + struct timespec __user *rmtp); +static void common_timer_get(struct k_itimer *, struct itimerspec *); +static int common_timer_set(struct k_itimer *, int, + struct itimerspec *, struct itimerspec *); +static int common_timer_del(struct k_itimer *timer); -static void posix_timer_fn(unsigned long); -static u64 do_posix_clock_monotonic_gettime_parts( - struct timespec *tp, struct timespec *mo); -int do_posix_clock_monotonic_gettime(struct timespec *tp); -static int do_posix_clock_monotonic_get(clockid_t, struct timespec *tp); +static void posix_timer_fn(void *data); static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); @@ -205,21 +190,25 @@ static inline int common_clock_set(clock static inline int common_timer_create(struct k_itimer *new_timer) { - INIT_LIST_HEAD(&new_timer->it.real.abs_timer_entry); - init_timer(&new_timer->it.real.timer); - new_timer->it.real.timer.data = (unsigned long) new_timer; + return -EINVAL; +} + +static int timer_create_mono(struct k_itimer *new_timer) +{ + ktimer_init(&new_timer->it.real.timer); + new_timer->it.real.timer.data = new_timer; + new_timer->it.real.timer.function = posix_timer_fn; + return 0; +} + +static int timer_create_real(struct k_itimer *new_timer) +{ + ktimer_init_real(&new_timer->it.real.timer); + new_timer->it.real.timer.data = new_timer; new_timer->it.real.timer.function = posix_timer_fn; return 0; } -/* - * These ones are defined below. - */ -static int common_nsleep(clockid_t, int flags, struct timespec *t); -static void common_timer_get(struct k_itimer *, struct itimerspec *); -static int common_timer_set(struct k_itimer *, int, - struct itimerspec *, struct itimerspec *); -static int common_timer_del(struct k_itimer *timer); /* * Return nonzero iff we know a priori this clockid_t value is bogus. 
@@ -239,19 +228,44 @@ static inline int invalid_clockid(clocki return 1; } +/* + * Get real time for posix timers + */ +static int posix_ktime_get_real_ts(clockid_t which_clock, struct timespec *tp) +{ + ktime_get_real_ts(tp); + return 0; +} + +/* + * Get monotonic time for posix timers + */ +static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) +{ + ktime_get_ts(tp); + return 0; +} + +void do_posix_clock_monotonic_gettime(struct timespec *ts) +{ + ktime_get_ts(ts); +} /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ static __init int init_posix_timers(void) { - struct k_clock clock_realtime = {.res = CLOCK_REALTIME_RES, - .abs_struct = &abs_list + struct k_clock clock_realtime = { + .clock_getres = ktimer_get_res_real, + .clock_get = posix_ktime_get_real_ts, + .timer_create = timer_create_real, }; - struct k_clock clock_monotonic = {.res = CLOCK_REALTIME_RES, - .abs_struct = NULL, - .clock_get = do_posix_clock_monotonic_get, - .clock_set = do_posix_clock_nosettime + struct k_clock clock_monotonic = { + .clock_getres = ktimer_get_res, + .clock_get = posix_ktime_get_ts, + .clock_set = do_posix_clock_nosettime, + .timer_create = timer_create_mono, }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); @@ -265,117 +279,17 @@ static __init int init_posix_timers(void __initcall(init_posix_timers); -static void tstojiffie(struct timespec *tp, int res, u64 *jiff) -{ - long sec = tp->tv_sec; - long nsec = tp->tv_nsec + res - 1; - - if (nsec > NSEC_PER_SEC) { - sec++; - nsec -= NSEC_PER_SEC; - } - - /* - * The scaling constants are defined in - * The difference between there and here is that we do the - * res rounding and compute a 64-bit result (well so does that - * but it then throws away the high bits). - */ - *jiff = (mpy_l_X_l_ll(sec, SEC_CONVERSION) + - (mpy_l_X_l_ll(nsec, NSEC_CONVERSION) >> - (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; -} - -/* - * This function adjusts the timer as needed as a result of the clock - * being set. It should only be called for absolute timers, and then - * under the abs_list lock. It computes the time difference and sets - * the new jiffies value in the timer. It also updates the timers - * reference wall_to_monotonic value. It is complicated by the fact - * that tstojiffies() only handles positive times and it needs to work - * with both positive and negative times. Also, for negative offsets, - * we need to defeat the res round up. - * - * Return is true if there is a new time, else false. - */ -static long add_clockset_delta(struct k_itimer *timr, - struct timespec *new_wall_to) -{ - struct timespec delta; - int sign = 0; - u64 exp; - - set_normalized_timespec(&delta, - new_wall_to->tv_sec - - timr->it.real.wall_to_prev.tv_sec, - new_wall_to->tv_nsec - - timr->it.real.wall_to_prev.tv_nsec); - if (likely(!(delta.tv_sec | delta.tv_nsec))) - return 0; - if (delta.tv_sec < 0) { - set_normalized_timespec(&delta, - -delta.tv_sec, - 1 - delta.tv_nsec - - posix_clocks[timr->it_clock].res); - sign++; - } - tstojiffie(&delta, posix_clocks[timr->it_clock].res, &exp); - timr->it.real.wall_to_prev = *new_wall_to; - timr->it.real.timer.expires += (sign ? 
-exp : exp); - return 1; -} - -static void remove_from_abslist(struct k_itimer *timr) -{ - if (!list_empty(&timr->it.real.abs_timer_entry)) { - spin_lock(&abs_list.lock); - list_del_init(&timr->it.real.abs_timer_entry); - spin_unlock(&abs_list.lock); - } -} static void schedule_next_timer(struct k_itimer *timr) { - struct timespec new_wall_to; - struct now_struct now; - unsigned long seq; - - /* - * Set up the timer for the next interval (if there is one). - * Note: this code uses the abs_timer_lock to protect - * it.real.wall_to_prev and must hold it until exp is set, not exactly - * obvious... - - * This function is used for CLOCK_REALTIME* and - * CLOCK_MONOTONIC* timers. If we ever want to handle other - * CLOCKs, the calling code (do_schedule_next_timer) would need - * to pull the "clock" info from the timer and dispatch the - * "other" CLOCKs "next timer" code (which, I suppose should - * also be added to the k_clock structure). - */ - if (!timr->it.real.incr) + if (ktime_cmp_val(timr->it.real.incr, ==, KTIME_ZERO)) return; - do { - seq = read_seqbegin(&xtime_lock); - new_wall_to = wall_to_monotonic; - posix_get_now(&now); - } while (read_seqretry(&xtime_lock, seq)); - - if (!list_empty(&timr->it.real.abs_timer_entry)) { - spin_lock(&abs_list.lock); - add_clockset_delta(timr, &new_wall_to); - - posix_bump_timer(timr, now); - - spin_unlock(&abs_list.lock); - } else { - posix_bump_timer(timr, now); - } - timr->it_overrun_last = timr->it_overrun; - timr->it_overrun = -1; + timr->it_overrun_last = timr->it.real.overrun; + timr->it.real.overrun = timr->it.real.timer.overrun = -1; ++timr->it_requeue_pending; - add_timer(&timr->it.real.timer); + ktimer_start(&timr->it.real.timer, &timr->it.real.incr, KTIMER_FORWARD); + timr->it.real.overrun = timr->it.real.timer.overrun; } /* @@ -413,14 +327,7 @@ int posix_timer_event(struct k_itimer *t { memset(&timr->sigq->info, 0, sizeof(siginfo_t)); timr->sigq->info.si_sys_private = si_private; - /* - * Send signal to the process that owns this timer. - - * This code assumes that all the possible abs_lists share the - * same lock (there is only one list at this time). If this is - * not the case, the CLOCK info would need to be used to find - * the proper abs list lock. - */ + /* Send signal to the process that owns this timer.*/ timr->sigq->info.si_signo = timr->it_sigev_signo; timr->sigq->info.si_errno = 0; @@ -454,65 +361,28 @@ EXPORT_SYMBOL_GPL(posix_timer_event); * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. 
*/ -static void posix_timer_fn(unsigned long __data) +static void posix_timer_fn(void *data) { - struct k_itimer *timr = (struct k_itimer *) __data; + struct k_itimer *timr = data; unsigned long flags; - unsigned long seq; - struct timespec delta, new_wall_to; - u64 exp = 0; - int do_notify = 1; + int si_private = 0; spin_lock_irqsave(&timr->it_lock, flags); - if (!list_empty(&timr->it.real.abs_timer_entry)) { - spin_lock(&abs_list.lock); - do { - seq = read_seqbegin(&xtime_lock); - new_wall_to = wall_to_monotonic; - } while (read_seqretry(&xtime_lock, seq)); - set_normalized_timespec(&delta, - new_wall_to.tv_sec - - timr->it.real.wall_to_prev.tv_sec, - new_wall_to.tv_nsec - - timr->it.real.wall_to_prev.tv_nsec); - if (likely((delta.tv_sec | delta.tv_nsec ) == 0)) { - /* do nothing, timer is on time */ - } else if (delta.tv_sec < 0) { - /* do nothing, timer is already late */ - } else { - /* timer is early due to a clock set */ - tstojiffie(&delta, - posix_clocks[timr->it_clock].res, - &exp); - timr->it.real.wall_to_prev = new_wall_to; - timr->it.real.timer.expires += exp; - add_timer(&timr->it.real.timer); - do_notify = 0; - } - spin_unlock(&abs_list.lock); - } - if (do_notify) { - int si_private=0; + if (ktime_cmp_val(timr->it.real.incr, !=, KTIME_ZERO)) + si_private = ++timr->it_requeue_pending; - if (timr->it.real.incr) - si_private = ++timr->it_requeue_pending; - else { - remove_from_abslist(timr); - } + if (posix_timer_event(timr, si_private)) + /* + * signal was not sent because of sig_ignor + * we will not get a call back to restart it AND + * it should be restarted. + */ + schedule_next_timer(timr); - if (posix_timer_event(timr, si_private)) - /* - * signal was not sent because of sig_ignor - * we will not get a call back to restart it AND - * it should be restarted. - */ - schedule_next_timer(timr); - } unlock_timer(timr, flags); /* hold thru abs lock to keep irq off */ } - static inline struct task_struct * good_sigevent(sigevent_t * event) { struct task_struct *rtn = current->group_leader; @@ -776,39 +646,41 @@ static struct k_itimer * lock_timer(time static void common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) { - unsigned long expires; - struct now_struct now; + ktime_t expires, now, remaining; + struct ktimer *timer = &timr->it.real.timer; - do - expires = timr->it.real.timer.expires; - while ((volatile long) (timr->it.real.timer.expires) != expires); - - posix_get_now(&now); - - if (expires && - ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) && - !timr->it.real.incr && - posix_time_before(&timr->it.real.timer, &now)) - timr->it.real.timer.expires = expires = 0; - if (expires) { - if (timr->it_requeue_pending & REQUEUE_PENDING || - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { - posix_bump_timer(timr, now); - expires = timr->it.real.timer.expires; - } - else - if (!timer_pending(&timr->it.real.timer)) - expires = 0; - if (expires) - expires -= now.jiffies; - } - jiffies_to_timespec(expires, &cur_setting->it_value); - jiffies_to_timespec(timr->it.real.incr, &cur_setting->it_interval); - - if (cur_setting->it_value.tv_sec < 0) { + memset(cur_setting, 0, sizeof(struct itimerspec)); + expires = ktimer_get_expiry(timer, &now); + remaining = ktime_sub(expires, now); + + /* Time left ? or timer pending */ + if (ktime_cmp_val(remaining, >, KTIME_ZERO) || ktimer_active(timer)) + goto calci; + /* interval timer ? 
*/ + if (ktime_cmp_val(timr->it.real.incr, ==, 0)) + return; + /* + * When a requeue is pending or this is a SIGEV_NONE timer + * move the expiry time forward by intervals, so expiry is > + * now. + * The active (non SIGEV_NONE) rearm should be done + * automatically by the ktimer REARM mode. Thats the next + * iteration. The REQUEUE_PENDING part will go away ! + */ + if (timr->it_requeue_pending & REQUEUE_PENDING || + (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { + remaining = forward_posix_timer(timr, now); + } + calci: + /* interval timer ? */ + if (ktime_cmp_val(timr->it.real.incr, !=, KTIME_ZERO)) + ktime_to_timespec(&cur_setting->it_interval, + timr->it.real.incr); + /* Return 0 only, when the timer is expired and not pending */ + if (ktime_cmp_val(remaining, <=, KTIME_ZERO)) cur_setting->it_value.tv_nsec = 1; - cur_setting->it_value.tv_sec = 0; - } + else + ktime_to_timespec(&cur_setting->it_value, remaining); } /* Get the time remaining on a POSIX.1b interval timer. */ @@ -832,6 +704,7 @@ sys_timer_gettime(timer_t timer_id, stru return 0; } + /* * Get the number of overruns of a POSIX.1b interval timer. This is to * be the overrun of the timer last delivered. At the same time we are @@ -858,84 +731,6 @@ sys_timer_getoverrun(timer_t timer_id) return overrun; } -/* - * Adjust for absolute time - * - * If absolute time is given and it is not CLOCK_MONOTONIC, we need to - * adjust for the offset between the timer clock (CLOCK_MONOTONIC) and - * what ever clock he is using. - * - * If it is relative time, we need to add the current (CLOCK_MONOTONIC) - * time to it to get the proper time for the timer. - */ -static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, - int abs, u64 *exp, struct timespec *wall_to) -{ - struct timespec now; - struct timespec oc = *tp; - u64 jiffies_64_f; - int rtn =0; - - if (abs) { - /* - * The mask pick up the 4 basic clocks - */ - if (!((clock - &posix_clocks[0]) & ~CLOCKS_MASK)) { - jiffies_64_f = do_posix_clock_monotonic_gettime_parts( - &now, wall_to); - /* - * If we are doing a MONOTONIC clock - */ - if((clock - &posix_clocks[0]) & CLOCKS_MONO){ - now.tv_sec += wall_to->tv_sec; - now.tv_nsec += wall_to->tv_nsec; - } - } else { - /* - * Not one of the basic clocks - */ - clock->clock_get(clock - posix_clocks, &now); - jiffies_64_f = get_jiffies_64(); - } - /* - * Take away now to get delta and normalize - */ - set_normalized_timespec(&oc, oc.tv_sec - now.tv_sec, - oc.tv_nsec - now.tv_nsec); - }else{ - jiffies_64_f = get_jiffies_64(); - } - /* - * Check if the requested time is prior to now (if so set now) - */ - if (oc.tv_sec < 0) - oc.tv_sec = oc.tv_nsec = 0; - - if (oc.tv_sec | oc.tv_nsec) - set_normalized_timespec(&oc, oc.tv_sec, - oc.tv_nsec + clock->res); - tstojiffie(&oc, clock->res, exp); - - /* - * Check if the requested time is more than the timer code - * can handle (if so we error out but return the value too). - */ - if (*exp > ((u64)MAX_JIFFY_OFFSET)) - /* - * This is a considered response, not exactly in - * line with the standard (in fact it is silent on - * possible overflows). We assume such a large - * value is ALMOST always a programming error and - * try not to compound it by setting a really dumb - * value. - */ - rtn = -EINVAL; - /* - * return the actual jiffies expire time, full 64 bits - */ - *exp += jiffies_64_f; - return rtn; -} /* Set a POSIX.1b interval timer. */ /* timr->it_lock is taken. 
*/ @@ -943,68 +738,52 @@ static inline int common_timer_set(struct k_itimer *timr, int flags, struct itimerspec *new_setting, struct itimerspec *old_setting) { - struct k_clock *clock = &posix_clocks[timr->it_clock]; - u64 expire_64; + ktime_t expires; + int mode; if (old_setting) common_timer_get(timr, old_setting); /* disable the timer */ - timr->it.real.incr = 0; + ktime_set_scalar(timr->it.real.incr, KTIME_ZERO); /* * careful here. If smp we could be in the "fire" routine which will * be spinning as we hold the lock. But this is ONLY an SMP issue. */ - if (try_to_del_timer_sync(&timr->it.real.timer) < 0) { -#ifdef CONFIG_SMP - /* - * It can only be active if on an other cpu. Since - * we have cleared the interval stuff above, it should - * clear once we release the spin lock. Of course once - * we do that anything could happen, including the - * complete melt down of the timer. So return with - * a "retry" exit status. - */ + if (ktimer_try_to_cancel(&timr->it.real.timer) < 0) return TIMER_RETRY; -#endif - } - - remove_from_abslist(timr); timr->it_requeue_pending = (timr->it_requeue_pending + 2) & ~REQUEUE_PENDING; timr->it_overrun_last = 0; timr->it_overrun = -1; - /* - *switch off the timer when it_value is zero - */ - if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) { - timr->it.real.timer.expires = 0; + + /* switch off the timer when it_value is zero */ + if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) return 0; - } - if (adjust_abs_time(clock, - &new_setting->it_value, flags & TIMER_ABSTIME, - &expire_64, &(timr->it.real.wall_to_prev))) { - return -EINVAL; - } - timr->it.real.timer.expires = (unsigned long)expire_64; - tstojiffie(&new_setting->it_interval, clock->res, &expire_64); - timr->it.real.incr = (unsigned long)expire_64; + mode = flags & TIMER_ABSTIME ? KTIMER_ABS : KTIMER_REL; - /* - * We do not even queue SIGEV_NONE timers! But we do put them - * in the abs list so we can do that right. + /* Posix madness. Only absolute CLOCK_REALTIME timers + * are affected by clock sets. So we must reiniatilize + * the timer. */ + if (timr->it_clock == CLOCK_REALTIME && mode == KTIMER_ABS) + timer_create_real(timr); + else + timer_create_mono(timr); + + expires = timespec_to_ktime(new_setting->it_value); + + /* Convert and round the interval */ + timr->it.real.incr = ktimer_round_timespec(&timr->it.real.timer, + &new_setting->it_interval); + + /* SIGEV_NONE timers are not queued ! See common_timer_get */ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)) - add_timer(&timr->it.real.timer); + ktimer_start(&timr->it.real.timer, &expires, + mode | KTIMER_NOCHECK | KTIMER_ROUND); - if (flags & TIMER_ABSTIME && clock->abs_struct) { - spin_lock(&clock->abs_struct->lock); - list_add_tail(&(timr->it.real.abs_timer_entry), - &(clock->abs_struct->list)); - spin_unlock(&clock->abs_struct->lock); - } return 0; } @@ -1039,6 +818,7 @@ retry: unlock_timer(timr, flag); if (error == TIMER_RETRY) { + wait_for_ktimer(&timr->it.real.timer); rtn = NULL; // We already got the old time... goto retry; } @@ -1052,24 +832,10 @@ retry: static inline int common_timer_del(struct k_itimer *timer) { - timer->it.real.incr = 0; + ktime_set_scalar(timer->it.real.incr, KTIME_ZERO); - if (try_to_del_timer_sync(&timer->it.real.timer) < 0) { -#ifdef CONFIG_SMP - /* - * It can only be active if on an other cpu. Since - * we have cleared the interval stuff above, it should - * clear once we release the spin lock. 
Of course once - * we do that anything could happen, including the - * complete melt down of the timer. So return with - * a "retry" exit status. - */ + if (ktimer_try_to_cancel(&timer->it.real.timer) < 0) return TIMER_RETRY; -#endif - } - - remove_from_abslist(timer); - return 0; } @@ -1085,24 +851,17 @@ sys_timer_delete(timer_t timer_id) struct k_itimer *timer; long flags; -#ifdef CONFIG_SMP - int error; retry_delete: -#endif timer = lock_timer(timer_id, &flags); if (!timer) return -EINVAL; -#ifdef CONFIG_SMP - error = timer_delete_hook(timer); - - if (error == TIMER_RETRY) { + if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); + wait_for_ktimer(&timer->it.real.timer); goto retry_delete; } -#else - timer_delete_hook(timer); -#endif + spin_lock(¤t->sighand->siglock); list_del(&timer->list); spin_unlock(¤t->sighand->siglock); @@ -1119,6 +878,7 @@ retry_delete: release_posix_timer(timer, IT_ID_SET); return 0; } + /* * return timer owned by the process, used by exit_itimers */ @@ -1126,22 +886,14 @@ static inline void itimer_delete(struct { unsigned long flags; -#ifdef CONFIG_SMP - int error; retry_delete: -#endif spin_lock_irqsave(&timer->it_lock, flags); -#ifdef CONFIG_SMP - error = timer_delete_hook(timer); - - if (error == TIMER_RETRY) { + if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); + wait_for_ktimer(&timer->it.real.timer); goto retry_delete; } -#else - timer_delete_hook(timer); -#endif list_del(&timer->list); /* * This keeps any tasks waiting on the spin lock from thinking @@ -1170,60 +922,7 @@ void exit_itimers(struct signal_struct * } } -/* - * And now for the "clock" calls - * - * These functions are called both from timer functions (with the timer - * spin_lock_irq() held and from clock calls with no locking. They must - * use the save flags versions of locks. - */ - -/* - * We do ticks here to avoid the irq lock ( they take sooo long). - * The seqlock is great here. Since we a reader, we don't really care - * if we are interrupted since we don't take lock that will stall us or - * any other cpu. Voila, no irq lock is needed. - * - */ - -static u64 do_posix_clock_monotonic_gettime_parts( - struct timespec *tp, struct timespec *mo) -{ - u64 jiff; - unsigned int seq; - - do { - seq = read_seqbegin(&xtime_lock); - getnstimeofday(tp); - *mo = wall_to_monotonic; - jiff = jiffies_64; - - } while(read_seqretry(&xtime_lock, seq)); - - return jiff; -} - -static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp) -{ - struct timespec wall_to_mono; - - do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono); - - tp->tv_sec += wall_to_mono.tv_sec; - tp->tv_nsec += wall_to_mono.tv_nsec; - - if ((tp->tv_nsec - NSEC_PER_SEC) > 0) { - tp->tv_nsec -= NSEC_PER_SEC; - tp->tv_sec++; - } - return 0; -} - -int do_posix_clock_monotonic_gettime(struct timespec *tp) -{ - return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp); -} - +/* Not available / possible... 
functions */ int do_posix_clock_nosettime(clockid_t clockid, struct timespec *tp) { return -EINVAL; @@ -1236,7 +935,8 @@ int do_posix_clock_notimer_create(struct } EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create); -int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t) +int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t, + struct timespec __user *r) { #ifndef ENOTSUP return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ @@ -1295,125 +995,34 @@ sys_clock_getres(clockid_t which_clock, return error; } -static void nanosleep_wake_up(unsigned long __data) -{ - struct task_struct *p = (struct task_struct *) __data; - - wake_up_process(p); -} - /* - * The standard says that an absolute nanosleep call MUST wake up at - * the requested time in spite of clock settings. Here is what we do: - * For each nanosleep call that needs it (only absolute and not on - * CLOCK_MONOTONIC* (as it can not be set)) we thread a little structure - * into the "nanosleep_abs_list". All we need is the task_struct pointer. - * When ever the clock is set we just wake up all those tasks. The rest - * is done by the while loop in clock_nanosleep(). - * - * On locking, clock_was_set() is called from update_wall_clock which - * holds (or has held for it) a write_lock_irq( xtime_lock) and is - * called from the timer bh code. Thus we need the irq save locks. - * - * Also, on the call from update_wall_clock, that is done as part of a - * softirq thing. We don't want to delay the system that much (possibly - * long list of timers to fix), so we defer that work to keventd. + * nanosleep for monotonic and realtime clocks */ - -static DECLARE_WAIT_QUEUE_HEAD(nanosleep_abs_wqueue); -static DECLARE_WORK(clock_was_set_work, (void(*)(void*))clock_was_set, NULL); - -static DECLARE_MUTEX(clock_was_set_lock); - -void clock_was_set(void) +static int common_nsleep(clockid_t which_clock, int flags, + struct timespec *tsave, struct timespec __user *rmtp) { - struct k_itimer *timr; - struct timespec new_wall_to; - LIST_HEAD(cws_list); - unsigned long seq; - + int mode = flags & TIMER_ABSTIME ? KTIMER_ABS : KTIMER_REL; - if (unlikely(in_interrupt())) { - schedule_work(&clock_was_set_work); - return; + switch (which_clock) { + case CLOCK_REALTIME: + /* Posix madness. Only absolute timers on clock realtime + are affected by clock set. */ + if (mode == KTIMER_ABS) + return ktimer_nanosleep_real(tsave, rmtp, mode); + case CLOCK_MONOTONIC: + return ktimer_nanosleep(tsave, rmtp, mode); + default: + break; } - wake_up_all(&nanosleep_abs_wqueue); - - /* - * Check if there exist TIMER_ABSTIME timers to correct. - * - * Notes on locking: This code is run in task context with irq - * on. We CAN be interrupted! All other usage of the abs list - * lock is under the timer lock which holds the irq lock as - * well. We REALLY don't want to scan the whole list with the - * interrupt system off, AND we would like a sequence lock on - * this code as well. Since we assume that the clock will not - * be set often, it seems ok to take and release the irq lock - * for each timer. In fact add_timer will do this, so this is - * not an issue. So we know when we are done, we will move the - * whole list to a new location. Then as we process each entry, - * we will move it to the actual list again. This way, when our - * copy is empty, we are done. We are not all that concerned - * about preemption so we will use a semaphore lock to protect - * aginst reentry. This way we will not stall another - * processor. 
It is possible that this may delay some timers - * that should have expired, given the new clock, but even this - * will be minimal as we will always update to the current time, - * even if it was set by a task that is waiting for entry to - * this code. Timers that expire too early will be caught by - * the expire code and restarted. - - * Absolute timers that repeat are left in the abs list while - * waiting for the task to pick up the signal. This means we - * may find timers that are not in the "add_timer" list, but are - * in the abs list. We do the same thing for these, save - * putting them back in the "add_timer" list. (Note, these are - * left in the abs list mainly to indicate that they are - * ABSOLUTE timers, a fact that is used by the re-arm code, and - * for which we have no other flag.) - - */ - - down(&clock_was_set_lock); - spin_lock_irq(&abs_list.lock); - list_splice_init(&abs_list.list, &cws_list); - spin_unlock_irq(&abs_list.lock); - do { - do { - seq = read_seqbegin(&xtime_lock); - new_wall_to = wall_to_monotonic; - } while (read_seqretry(&xtime_lock, seq)); - - spin_lock_irq(&abs_list.lock); - if (list_empty(&cws_list)) { - spin_unlock_irq(&abs_list.lock); - break; - } - timr = list_entry(cws_list.next, struct k_itimer, - it.real.abs_timer_entry); - - list_del_init(&timr->it.real.abs_timer_entry); - if (add_clockset_delta(timr, &new_wall_to) && - del_timer(&timr->it.real.timer)) /* timer run yet? */ - add_timer(&timr->it.real.timer); - list_add(&timr->it.real.abs_timer_entry, &abs_list.list); - spin_unlock_irq(&abs_list.lock); - } while (1); - - up(&clock_was_set_lock); + return -EINVAL; } -long clock_nanosleep_restart(struct restart_block *restart_block); - asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags, const struct timespec __user *rqtp, struct timespec __user *rmtp) { struct timespec t; - struct restart_block *restart_block = - &(current_thread_info()->restart_block); - int ret; if (invalid_clockid(which_clock)) return -EINVAL; @@ -1421,135 +1030,8 @@ sys_clock_nanosleep(clockid_t which_cloc if (copy_from_user(&t, rqtp, sizeof (struct timespec))) return -EFAULT; - if ((unsigned) t.tv_nsec >= NSEC_PER_SEC || t.tv_sec < 0) + if (!timespec_valid(&t)) return -EINVAL; - /* - * Do this here as nsleep function does not have the real address. - */ - restart_block->arg1 = (unsigned long)rmtp; - - ret = CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t)); - - if ((ret == -ERESTART_RESTARTBLOCK) && rmtp && - copy_to_user(rmtp, &t, sizeof (t))) - return -EFAULT; - return ret; -} - - -static int common_nsleep(clockid_t which_clock, - int flags, struct timespec *tsave) -{ - struct timespec t, dum; - struct timer_list new_timer; - DECLARE_WAITQUEUE(abs_wqueue, current); - u64 rq_time = (u64)0; - s64 left; - int abs; - struct restart_block *restart_block = - ¤t_thread_info()->restart_block; - - abs_wqueue.flags = 0; - init_timer(&new_timer); - new_timer.expires = 0; - new_timer.data = (unsigned long) current; - new_timer.function = nanosleep_wake_up; - abs = flags & TIMER_ABSTIME; - - if (restart_block->fn == clock_nanosleep_restart) { - /* - * Interrupted by a non-delivered signal, pick up remaining - * time and continue. Remaining time is in arg2 & 3. 
- */ - restart_block->fn = do_no_restart_syscall; - - rq_time = restart_block->arg3; - rq_time = (rq_time << 32) + restart_block->arg2; - if (!rq_time) - return -EINTR; - left = rq_time - get_jiffies_64(); - if (left <= (s64)0) - return 0; /* Already passed */ - } - - if (abs && (posix_clocks[which_clock].clock_get != - posix_clocks[CLOCK_MONOTONIC].clock_get)) - add_wait_queue(&nanosleep_abs_wqueue, &abs_wqueue); - - do { - t = *tsave; - if (abs || !rq_time) { - adjust_abs_time(&posix_clocks[which_clock], &t, abs, - &rq_time, &dum); - } - - left = rq_time - get_jiffies_64(); - if (left >= (s64)MAX_JIFFY_OFFSET) - left = (s64)MAX_JIFFY_OFFSET; - if (left < (s64)0) - break; - - new_timer.expires = jiffies + left; - __set_current_state(TASK_INTERRUPTIBLE); - add_timer(&new_timer); - - schedule(); - - del_timer_sync(&new_timer); - left = rq_time - get_jiffies_64(); - } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); - - if (abs_wqueue.task_list.next) - finish_wait(&nanosleep_abs_wqueue, &abs_wqueue); - - if (left > (s64)0) { - - /* - * Always restart abs calls from scratch to pick up any - * clock shifting that happened while we are away. - */ - if (abs) - return -ERESTARTNOHAND; - - left *= TICK_NSEC; - tsave->tv_sec = div_long_long_rem(left, - NSEC_PER_SEC, - &tsave->tv_nsec); - /* - * Restart works by saving the time remaing in - * arg2 & 3 (it is 64-bits of jiffies). The other - * info we need is the clock_id (saved in arg0). - * The sys_call interface needs the users - * timespec return address which _it_ saves in arg1. - * Since we have cast the nanosleep call to a clock_nanosleep - * both can be restarted with the same code. - */ - restart_block->fn = clock_nanosleep_restart; - restart_block->arg0 = which_clock; - /* - * Caller sets arg1 - */ - restart_block->arg2 = rq_time & 0xffffffffLL; - restart_block->arg3 = rq_time >> 32; - - return -ERESTART_RESTARTBLOCK; - } - - return 0; -} -/* - * This will restart clock_nanosleep. - */ -long -clock_nanosleep_restart(struct restart_block *restart_block) -{ - struct timespec t; - int ret = common_nsleep(restart_block->arg0, 0, &t); - - if ((ret == -ERESTART_RESTARTBLOCK) && restart_block->arg1 && - copy_to_user((struct timespec __user *)(restart_block->arg1), &t, - sizeof (t))) - return -EFAULT; - return ret; + return CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t, rmtp)); } Index: linux-2.6.14/Documentation/DocBook/kernel-api.tmpl =================================================================== --- linux-2.6.14.orig/Documentation/DocBook/kernel-api.tmpl +++ linux-2.6.14/Documentation/DocBook/kernel-api.tmpl @@ -54,6 +54,11 @@ !Ekernel/sched.c !Ekernel/timer.c + High-precision timers +!Iinclude/linux/ktime.h +!Iinclude/linux/ktimer.h +!Ekernel/ktimers.c + Internal Functions !Ikernel/exit.c !Ikernel/signal.c Index: linux-2.6.14/Documentation/ktimers.txt =================================================================== --- /dev/null +++ linux-2.6.14/Documentation/ktimers.txt @@ -0,0 +1,239 @@ + +ktimers - subsystem for high-precision kernel timers +---------------------------------------------------- + +This patch introduces a new subsystem for high-precision kernel timers. + +Why two timer subsystems? After a lot of back and forth trying to +integrate high-precision and high-resolution features into the existing +timer framework, and after testing various such high-resolution timer +implementations in practice, we came to the conclusion that the timer +wheel code is fundamentally not suitable for such an approach. 
We
+initially didn't believe this ('there must be a way to solve this'), and
+we spent considerable effort trying to integrate things into the timer
+wheel, but we failed. There are several reasons why such integration is
+impossible:
+
+- the forced handling of low-resolution and high-resolution timers in
+  the same way leads to a lot of compromises, macro magic and #ifdef
+  mess. The timers.c code is very "tightly coded" around jiffies and
+  32-bitness assumptions, and has been honed and micro-optimized for a
+  narrow use case for many years - and thus even small extensions to it
+  frequently break the wheel concept, leading to even worse
+  compromises.
+
+- the unpredictable [O(N)] overhead of cascading leads to delays which
+  necessitate a more complex handling of high resolution timers, which
+  decreases robustness. Such a design still led to rather large timing
+  inaccuracies. Cascading is a fundamental property of the timer wheel
+  concept; it cannot be 'designed out' without inevitably degrading
+  other portions of the timers.c code in an unacceptable way.
+
+- the implementation of the current posix-timer subsystem on top of
+  the timer wheel has already introduced a quite complex handling of
+  the required readjusting of absolute CLOCK_REALTIME timers at
+  settimeofday or NTP time - showing the rigidity of the timer wheel
+  data structure.
+
+- the timer wheel code is optimal for use cases which can be
+  identified as "timeouts". Such timeouts are usually set up to cover
+  error conditions in various I/O paths, such as networking and block
+  I/O. The vast majority of those timers never expire and are rarely
+  recascaded because the expected correct event arrives in time so they
+  can be removed from the timer wheel before any further processing of
+  them becomes necessary. Thus the users of these timeouts can accept
+  the granularity and precision tradeoffs of the timer wheel, and
+  largely expect the timer subsystem to have near-zero overhead. Timing
+  for them is not a core purpose; it's mostly a necessary evil to
+  guarantee the processing of requests, which should be as cheap and
+  unintrusive as possible.
+
+The primary users of precision timers are user-space applications that
+utilize nanosleep, posix-timers and itimer interfaces. Also, in-kernel
+users like drivers and subsystems with a requirement for precise timed
+events can benefit from the availability of a separate high-precision
+timer subsystem as well.
+
+The ktimer subsystem is easily extended with high-resolution
+capabilities, and patches for that exist and are maturing quickly. The
+increasing demand for realtime and multimedia applications along with
+other potential users for precise timers gives another reason to
+separate the "timeout" and "precise timer" subsystems.
+
+Another potential benefit is that such a separation allows for future
+optimizations of the existing timer wheel implementation for the low
+resolution and low precision use cases - once the precision-sensitive
+APIs are separated from the timer wheel and are migrated over to
+ktimers. E.g. we could decrease the frequency of the timeout subsystem
+from 250 Hz to 100 Hz (or even lower).
+
+ktimer subsystem implementation details
+---------------------------------------
+
+The basic design considerations were:
+
+- simplicity
+- robust, extensible abstractions
+- a data structure not bound to jiffies or any other granularity
+- simplification of existing, timing-related kernel code
+
+From our previous experience with various approaches to high-resolution
+timers, another basic requirement was the immediate enqueueing and
+ordering of timers at activation time. After looking at several possible
+solutions such as radix trees and hashes, the red-black tree was chosen
+as the basic data structure. Rbtrees are available as a library in the
+kernel and are used in various performance-critical areas of e.g. memory
+management and file systems. The rbtree is solely used for the
+time-sorted ordering, while a separate list is used to give the expiry
+code fast access to the queued timers, without having to walk the
+rbtree. (This separate list is also useful for high-resolution timers,
+where we need separate pending and expired queues while keeping the
+time-order intact.)
+
+The time-ordered enqueueing is not purely for the purposes of the
+high-resolution timers extension though; it also simplifies the handling
+of absolute timers based on CLOCK_REALTIME. The existing implementation
+needed to keep an extra list of all armed absolute CLOCK_REALTIME timers
+along with complex locking. In case of settimeofday and NTP, all the
+timers (!) had to be dequeued, the time-changing code had to fix them up
+one by one, and all of them had to be enqueued again. The time-ordered
+enqueueing and the storage of the expiry time in absolute time units
+removes all this complex and poorly scaling code from the posix-timer
+implementation - the clock can simply be set without having to touch the
+rbtree. This also makes the handling of posix-timers simpler in general.
+
+The locking and per-CPU behavior of ktimers was mostly taken from the
+existing timer wheel code, as it is mature and well suited. Sharing code
+was not really a win, due to the different data structures. Also, the
+ktimer functions now have clearer behavior and clearer names - such as
+ktimer_try_to_cancel() and ktimer_cancel() [which are roughly equivalent
+to del_timer() and del_timer_sync()] - and there's no direct 1:1 mapping
+between them on the algorithmic level.
+
+The internal representation of time values (ktime_t) is implemented via
+macros and inline functions, and can be switched between a "hybrid
+union" type and a plain "scalar" 64-bit nanoseconds representation (at
+compile time). The hybrid union type exists to optimize time conversions
+on 32-bit CPUs. This build-time-selectable ktime_t storage format was
+implemented to avoid the performance impact of 64-bit multiplications
+and divisions on 32-bit CPUs. Such operations are frequently necessary
+to convert between the storage formats provided by kernel and userspace
+interfaces and the internal time format. (See include/linux/ktime.h for
+further details.)
+
+ktimers - rounding of timer values
+----------------------------------
+
+Why do we need rounding at all?
+
+Firstly, the POSIX specification requires rounding to the resolution -
+whatever that means. The POSIX specification is quite imprecise on the
+details of rounding though, so a practical interpretation had to be
+found.
+
+The first question is which resolution value should be returned to the
+user by the clock_getres() interface.
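+
+For reference - and purely as an illustration, this snippet is not part
+of the patch - a user-space application queries this value through the
+standard clock_getres() call (link with -lrt on glibc):
+
+    #include <stdio.h>
+    #include <time.h>
+
+    int main(void)
+    {
+        struct timespec res;
+
+        /* ask the kernel for the effective CLOCK_MONOTONIC resolution */
+        if (clock_getres(CLOCK_MONOTONIC, &res))
+            return 1;
+        printf("effective resolution: %ld.%09ld sec\n",
+               (long) res.tv_sec, res.tv_nsec);
+        return 0;
+    }
+
+The value printed here is the 'effective' resolution discussed in the
+cases below.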
+
+The simplest case is when the hardware is capable of 1 nsec resolution:
+in that case we can fulfill all wishes and there is no rounding :-)
+
+Another simple case is when the clock hardware has a limited resolution
+that the kernel wants to fully offer to user-space: in this case that
+limited resolution is returned to userspace.
+
+The hairy case is when the underlying hardware is capable of
+finer-grained resolution, but the kernel is not willing to offer that
+resolution. Why would the kernel want to do that? Because e.g. the
+system could easily be DoS-ed with high-frequency timer interrupts. Or
+the kernel might want to cluster high-res timer interrupts into groups
+for performance reasons, so that extremely high interrupt rates are
+avoided. So the kernel needs some leeway in deciding the 'effective'
+resolution that it is willing to expose to userspace.
+
+In this case, the clock_getres() decision is easy: we want to return the
+'effective' resolution, not the 'theoretical' resolution. Thus an
+application programmer gets correct information about what granularity
+and accuracy to expect from the system.
+
+What is much less obvious in both the 'hardware is low-res' and 'kernel
+wants to offer low-res' cases is the actual behavior of timers, and
+where and how to round time values to the 'effective' resolution of the
+clock.
+
+For this we first need to see what types of expiries exist for ktimers,
+and how rounding affects them. Ktimers have the following variants:
+
+- relative one-shot timers
+- absolute one-shot timers
+- relative interval timers
+- absolute interval timers
+
+Interval timers can be reduced to one-shot timers: they are a series of
+one-shot timers with the same interval. Relative one-shot timers can be
+handled identically to absolute one-shot timers after adding the
+relative expiry time to the current time of the respective clock.
+
+We chose to handle two cases of rounding:
+
+- the rounding of the absolute value of the first expiry time
+- the rounding of the timer interval
+
+An alternative implementation would be to not round the interval and to
+implicitly round at every timer event, but it's not clear what the
+advantages of doing that would be. There are a couple of disadvantages:
+
+- the technique seems to contradict the standard's requirement that
+  'time values ... be rounded' (and the interval clearly is a time
+  value).
+
+- other OSs implement the rounding in the way we implemented it.
+
+- also, there is an application surprise factor: the 'do not round
+  intervals' technique can lead to the following sample sequence of
+  events:
+
+      Interval:   1.7ms
+      Resolution: 1ms
+
+      Event timeline:
+
+      2ms - 4ms - 6ms - 7ms - 9ms - 11ms - 12ms - 14ms - 16ms - 17ms ...
+
+  This 2,2,1,2,2,1... msec 'unpredictable and uneven' relative distance
+  of events could surprise applications.
+
+(As a sidenote, the current POSIX APIs could be extended with a way for
+periodic timers to have an 'average' frequency, where there is no
+rounding of the interval. No such API exists at the moment.)
+
+ktimers - testing and verification
+----------------------------------
+
+We used the high-resolution timer subsystem on top of ktimers to verify
+the ktimer implementation details in practice, and we also ran the posix
+timer tests in order to ensure specification compliance.
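+
+As a user-visible illustration of the interfaces this exercises - again
+a sketch, not part of the patch or of the test suite - the following
+program arms a periodic CLOCK_MONOTONIC posix-timer with the 1.7 msec
+interval from the rounding example above; assuming a 1 msec effective
+resolution, the interval is rounded up to 2 msec, so the expiries arrive
+with an even 2 msec spacing (link with -lrt on glibc):
+
+    #include <signal.h>
+    #include <stdio.h>
+    #include <time.h>
+    #include <unistd.h>
+
+    static volatile sig_atomic_t expiries;
+
+    static void on_timer(int sig)
+    {
+        expiries++;
+    }
+
+    int main(void)
+    {
+        struct sigevent sev = {
+            .sigev_notify = SIGEV_SIGNAL,
+            .sigev_signo  = SIGALRM,
+        };
+        struct itimerspec its = {
+            .it_value    = { .tv_sec = 0, .tv_nsec = 1700000 },
+            .it_interval = { .tv_sec = 0, .tv_nsec = 1700000 },
+        };
+        timer_t tid;
+
+        signal(SIGALRM, on_timer);
+
+        /* periodic 1.7 msec timer on the monotonic clock */
+        if (timer_create(CLOCK_MONOTONIC, &sev, &tid))
+            return 1;
+        if (timer_settime(tid, 0, &its, NULL))
+            return 1;
+
+        while (expiries < 10)
+            pause();
+
+        printf("overrun count at last expiry: %d\n",
+               timer_getoverrun(tid));
+        timer_delete(tid);
+        return 0;
+    }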
+ +The ktimer patch converts the following kernel functionality to use +ktimers: + + - nanosleep + - itimers + - posix-timers + +The conversion of nanosleep and posix-timers enabled the unification of +nanosleep and clock_nanosleep. + +The code was successfully compiled for the following platforms: + + i386, x86_64, ARM, PPC, PPC64, IA64 + +The code was run-tested on the following platforms: + + i386(UP/SMP), x86_64(UP/SMP), ARM, PPC + +ktimers were also integrated into the -rt tree, along with a +ktimers-based high-resolution timer implementation, so the ktimers code +got a healthy amount of testing and use in practice. + + Thomas Gleixner, Ingo Molnar Index: linux-2.6.14/fs/proc/array.c =================================================================== --- linux-2.6.14.orig/fs/proc/array.c +++ linux-2.6.14/fs/proc/array.c @@ -330,7 +330,7 @@ static int do_task_stat(struct task_stru unsigned long min_flt = 0, maj_flt = 0; cputime_t cutime, cstime, utime, stime; unsigned long rsslim = 0; - unsigned long it_real_value = 0; + DEFINE_KTIME(it_real_value); struct task_struct *t; char tcomm[sizeof(task->comm)]; @@ -386,7 +386,7 @@ static int do_task_stat(struct task_stru utime = cputime_add(utime, task->signal->utime); stime = cputime_add(stime, task->signal->stime); } - it_real_value = task->signal->it_real_value; + it_real_value = task->signal->real_timer.expires; } ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0; read_unlock(&tasklist_lock); @@ -435,7 +435,7 @@ static int do_task_stat(struct task_stru priority, nice, num_threads, - jiffies_to_clock_t(it_real_value), + (long) ktime_to_clock_t(it_real_value), start_time, vsize, mm ? get_mm_counter(mm, rss) : 0, /* you might want to shift this left 3 */ Index: linux-2.6.14/include/linux/calc64.h =================================================================== --- /dev/null +++ linux-2.6.14/include/linux/calc64.h @@ -0,0 +1,49 @@ +#ifndef _linux_CALC64_H +#define _linux_CALC64_H + +#include +#include + +/* + * div_long_long_rem was moved out of jiffies.h as it is + * a general math function useful for other things than + * jiffy code. + * + * This is a generic macro which is used when the architecture + * specific div64.h does not provide a optimized one. + * + * The 64bit dividend is divided by the divisor (data type long), the + * result is returned and the remainder stored in the variable + * referenced by remainder (data type long *). In contrast to the + * do_div macro the dividend is kept intact. + */ +#ifndef div_long_long_rem +#define div_long_long_rem(dividend,divisor,remainder) \ +({ \ + u64 result = dividend; \ + *remainder = do_div(result,divisor); \ + result; \ +}) +#endif + +/* + * Sign aware variation of the above. On some architectures a + * negative dividend leads to an divide overflow exception, which + * is avoided by the sign check. 
+ */ +static inline long div_long_long_rem_signed(long long dividend, + long divisor, + long *remainder) +{ + long res; + + if (unlikely(dividend < 0)) { + res = -div_long_long_rem(-dividend, divisor, remainder); + *remainder = -(*remainder); + } else { + res = div_long_long_rem(dividend, divisor, remainder); + } + return res; +} + +#endif Index: linux-2.6.14/include/linux/jiffies.h =================================================================== --- linux-2.6.14.orig/include/linux/jiffies.h +++ linux-2.6.14/include/linux/jiffies.h @@ -1,21 +1,12 @@ #ifndef _LINUX_JIFFIES_H #define _LINUX_JIFFIES_H +#include #include #include #include #include #include /* for HZ */ -#include - -#ifndef div_long_long_rem -#define div_long_long_rem(dividend,divisor,remainder) \ -({ \ - u64 result = dividend; \ - *remainder = do_div(result,divisor); \ - result; \ -}) -#endif /* * The following defines establish the engineering parameters of the PLL Index: linux-2.6.14/include/linux/ktime.h =================================================================== --- /dev/null +++ linux-2.6.14/include/linux/ktime.h @@ -0,0 +1,390 @@ +/* + * include/linux/ktime.h + * + * ktime_t - nanosecond-resolution time format. + * + * Copyright(C) 2005, Thomas Gleixner + * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar + * + * data type definitions, declarations, prototypes and macros. + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#ifndef _LINUX_KTIME_H +#define _LINUX_KTIME_H + +#include + +/* + * ktime_t: + * + * On 64-bit CPUs a single 64-bit variable is used to store the ktimers + * internal representation of time values in scalar nanoseconds. The + * design plays out best on 64-bit CPUs, where most conversions are + * NOPs and most arithmetic ktime_t operations are plain arithmetic + * operations. + * + * On 32-bit CPUs an optimized representation of the timespec structure + * is used to avoid expensive conversions from and to timespecs. The + * endian-aware order of the tv struct members is choosen to allow + * mathematical operations on the tv64 member of the union too, which + * for certain operations produces better code. + * + * For architectures with efficient support for 64/32-bit conversions the + * plain scalar nanosecond based representation can be selected by the + * config switch CONFIG_KTIME_SCALAR. + */ + +#define KTIME_ZERO 0 +#define KTIME_MAX (~((u64)1 << 63)) + +/* + * ktime_t definitions when using the 64-bit scalar representation: + */ + +#if (BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR) + +typedef s64 ktime_t; + +/* Define a ktime_t variable and initialize it to zero: */ +#define DEFINE_KTIME(kt) ktime_t kt = 0 + +/* + * Compare two ktime_t variables. The comparison operand is + * given as a literal in the macro call (e.g. <, >, ==): + * + * ( E.g. "ktime_cmp(t1, <, t2) is still more readable to programmers + * than ktime_before()/ktime_after() would be. ) + */ +#define ktime_cmp(a, op, b) ((a) op (b)) + +/* + * Compare a ktime_t variable and a constant. The comparison operand is + * given as a literal in the macro call (e.g. 
<, >, ==): + */ +#define ktime_cmp_val(a, op, b) ((a) op (b)) + +/** + * ktime_set - Set a ktime_t variable from a seconds/nanoseconds value + * + * @secs: seconds to set + * @nsecs: nanoseconds to set + * + * Return the ktime_t representation of the value + */ +#define ktime_set(sec, nsec) (((s64)(sec) * NSEC_PER_SEC) + (s64)(nsec)) + +/* + * Set a ktime_t variable to a value in a scalar nanosecond representation + * + * NOTE: use only with KTIME_ZERO or KTIME_MAX to maintain compability + * with the union type version. + */ +#define ktime_set_scalar(kt, s) (kt) = (s) + +/* + * The following 3 macros are used for the nanosleep restart handling + * to store the "low" and "high" part of a 64-bit ktime variable. + * (on 32-bit CPUs the restart block has 32-bit fields, so we have to + * split the 64-bit value up into two pieces) + * + * In the scalar representation we have to split up the 64-bit scalar: + */ + +/* Set the "low" and "high" part of a ktime_t variable: */ +#define ktime_set_low_high(l, h) ((s64)((u64)(l)) | (((s64)(h)) << 32)) + +/* Get the "low" part of a ktime_t variable: */ +#define ktime_get_low(kt) ((kt) & 0xFFFFFFFF) + +/* Get the "high" part of a ktime_t variable: */ +#define ktime_get_high(kt) ((kt) >> 32) + +/* Subtract two ktime_t variables. rem = lhs -rhs: */ +#define ktime_sub(lhs, rhs) ((lhs) - (rhs)) + +/* Add two ktime_t variables. res = lhs + rhs: */ +#define ktime_add(lhs, rhs) ((lhs) + (rhs)) + +/* + * Add a ktime_t variable and a scalar nanosecond value. + * res = kt + nsval: + */ +#define ktime_add_ns(kt, nsval) ((kt) + (nsval)) + +/* convert a timespec to ktime_t format: */ +#define timespec_to_ktime(ts) ktime_set((ts).tv_sec, (ts).tv_nsec) + +/* convert a timeval to ktime_t format: */ +#define timeval_to_ktime(tv) ktime_set((tv).tv_sec, (tv).tv_usec * 1000) + +/* Map the ktime_t to timespec conversion to ns_to_timespec function */ +#define ktime_to_timespec(ts, kt) ns_to_timespec(ts, kt) + +/* Map the ktime_t to timeval conversion to ns_to_timeval function */ +#define ktime_to_timeval(tv, kt) ns_to_timeval(tv, kt) + +/* Map the ktime_t to clock_t conversion to the inline in jiffies.h: */ +#define ktime_to_clock_t(kt) nsec_to_clock_t(kt) + +/* Convert ktime_t to nanoseconds - NOP in the scalar storage format: */ +#define ktime_to_ns(kt) (kt) + +#if (BITS_PER_LONG == 64) +/* + * Calc ktime_t modulo div. + * div is less than NSEC_PER_SEC and (NSEC_PER_SEC % div) = 0 ! + */ +#define ktime_modulo(kt, div) (unsigned long)(kt % div) +#endif + +#else + +/* + * Helper macros/inlines to get the ktime_t math right in the timespec + * representation. The macros are sometimes ugly - their actual use is + * pretty okay-ish, given the circumstances. We do all this for + * performance reasons. The pure scalar nsec_t based code was nice and + * simple, but created too many 64-bit / 32-bit conversions and divisions. + * + * Be especially aware that negative values are represented in a way + * that the tv.sec field is negative and the tv.nsec field is greater + * or equal to zero but less than nanoseconds per second. This is the + * same representation which is used by timespecs. + * + * tv.sec < 0 and 0 >= tv.nsec < NSEC_PER_SEC + */ + +typedef union { + s64 tv64; + struct { +#ifdef __BIG_ENDIAN + s32 sec, nsec; +#else + s32 nsec, sec; +#endif + } tv; +} ktime_t; + +/* Define a ktime_t variable and initialize it to zero: */ +#define DEFINE_KTIME(kt) ktime_t kt = { .tv64 = 0 } + +/* + * Compare two ktime_t variables. 
The comparison operand is + * given as a literal in the macro call (e.g. <, >, ==): + */ +#define ktime_cmp(a, op, b) ((a).tv64 op (b).tv64) + +/* + * Compare a ktime_t variable and a constant. The comparison operand is + * given as a literal in the macro call (e.g. <, >, ==): + */ +#define ktime_cmp_val(a, op, b) ((a).tv64 op (b)) + +/* Set a ktime_t variable to a value in sec/nsec representation: */ +static inline ktime_t ktime_set(long secs, unsigned long nsecs) +{ + return (ktime_t) { .tv = { .sec = secs, .nsec = nsecs } }; +} + +/* + * Set the scalar value of a ktime variable (union type) + * NOTE: use only with KTIME_ZERO or KTIME_MAX! + */ +#define ktime_set_scalar(kt, s) (kt).tv64 = (s) + +/* + * The following 3 macros are used for the nanosleep restart handling + * to store the "low" and "high" part of a 64-bit ktime variable. + * (on 32-bit CPUs the restart block has 32-bit fields, so we have to + * split the 64-bit value up into two pieces) + * + * In the union type representation this is just storing and restoring + * the sec and nsec members of the tv structure: + */ + +/* Set the "low" and "high" part of a ktime_t variable: */ +#define ktime_set_low_high(l, h) ktime_set(h, l) + +/* Get the "low" part of a ktime_t variable: */ +#define ktime_get_low(kt) (kt).tv.nsec + +/* Get the "high" part of a ktime_t variable: */ +#define ktime_get_high(kt) (kt).tv.sec + +/** + * ktime_sub - subtract two ktime_t variables + * + * @lhs: minuend + * @rhs: subtrahend + * + * Returns the remainder of the substraction + */ +static inline ktime_t ktime_sub(ktime_t lhs, ktime_t rhs) +{ + ktime_t res; + + res.tv64 = lhs.tv64 - rhs.tv64; + if (res.tv.nsec < 0) + res.tv.nsec += NSEC_PER_SEC; + + return res; +} + +/** + * ktime_add - add two ktime_t variables + * + * @add1: addend1 + * @add2: addend2 + * + * Returns the sum of addend1 and addend2 + */ +static inline ktime_t ktime_add(ktime_t add1, ktime_t add2) +{ + ktime_t res; + + res.tv64 = add1.tv64 + add2.tv64; + /* + * performance trick: the (u32) -NSEC gives 0x00000000Fxxxxxxx + * so we subtract NSEC_PER_SEC and add 1 to the upper 32 bit. 
+ * + * it's equivalent to: + * tv.nsec -= NSEC_PER_SEC + * tv.sec ++; + */ + if (res.tv.nsec >= NSEC_PER_SEC) + res.tv64 += (u32)-NSEC_PER_SEC; + + return res; +} + +/** + * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable + * + * @kt: addend + * @nsec: the scalar nsec value to add + * + * Returns the sum of kt and nsec in ktime_t format + */ +extern ktime_t ktime_add_ns(ktime_t kt, u64 nsec); + +/** + * timespec_to_ktime - convert a timespec to ktime_t format + * + * @ts: the timespec variable to convert + * + * Returns a ktime_t variable with the converted timespec value + */ +static inline ktime_t timespec_to_ktime(struct timespec ts) +{ + return (ktime_t) { .tv = { .sec = (s32)ts.tv_sec, + .nsec = (s32)ts.tv_nsec } }; +} + +/** + * timeval_to_ktime - convert a timeval to ktime_t format + * + * @tv: the timeval variable to convert + * + * Returns a ktime_t variable with the converted timeval value + */ +static inline ktime_t timeval_to_ktime(struct timeval tv) +{ + return (ktime_t) { .tv = { .sec = (s32)tv.tv_sec, + .nsec = (s32)tv.tv_usec * 1000 } }; +} + +/** + * ktime_to_timespec - convert a ktime_t variable to timespec format + * + * @ts: pointer to timespec variable to store result + * @kt: the ktime_t variable to convert + * + * Stores the timespec representation of the ktime value in + * the timespec variable pointed to by @ts + */ +static inline void ktime_to_timespec(struct timespec *ts, ktime_t kt) +{ + ts->tv_sec = (time_t) kt.tv.sec; + ts->tv_nsec = (long) kt.tv.nsec; +} + +/** + * ktime_to_timeval - convert a ktime_t variable to timeval format + * + * @tv: pointer to timeval variable to store result + * @kt: the ktime_t variable to convert + * + * Stores the timeval representation of the ktime value in + * the timeval variable pointed to by @tv + */ +static inline void ktime_to_timeval(struct timeval *tv, ktime_t kt) +{ + tv->tv_sec = (time_t) kt.tv.sec; + tv->tv_usec = (suseconds_t) (kt.tv.nsec / NSEC_PER_USEC); +} + +/** + * ktime_to_clock_t - convert a ktime_t variable to clock_t format + * @kt: the ktime_t variable to convert + * + * Returns a clock_t variable with the converted value + */ +static inline clock_t ktime_to_clock_t(ktime_t kt) +{ + return nsec_to_clock_t( (u64) kt.tv.sec * NSEC_PER_SEC + kt.tv.nsec); +} + +/** + * ktime_to_ns - convert a ktime_t variable to scalar nanoseconds + * @kt: the ktime_t variable to convert + * + * Returns the scalar nanoseconds representation of kt + */ +static inline u64 ktime_to_ns(ktime_t kt) +{ + return (u64) kt.tv.sec * NSEC_PER_SEC + kt.tv.nsec; +} + +/* + * Calc ktime_t modulo div. + * div is less than NSEC_PER_SEC and (NSEC_PER_SEC % div) = 0 ! + */ +#define ktime_modulo(kt, div) ((unsigned long)kt.tv.nsec % div) + +#endif + +/* + * The resolution of the clocks. The resolution value is returned in + * the clock_getres() system call to give application programmers an + * idea of the (in)accuracy of timers. Timer values are rounded up to + * this resolution values. 
+ */ +#define KTIME_LOW_RES (NSEC_PER_SEC/HZ) + +#ifdef CONFIG_GENERIC_TIME + +#define ktime_get get_monotonic_clock +#define ktime_get_real get_realtime_clock +#define ktime_get_ts(ts) get_monotonic_clock_ts(ts) +#define ktime_get_real_ts(ts) get_realtime_clock_ts(ts) + +#else /* CONFIG_GENERIC_TIME */ + +/* Get the monotonic time in ktime_t format: */ +extern ktime_t ktime_get(void); + +/* Get the real (wall-) time in ktime_t format: */ +extern ktime_t ktime_get_real(void); + +/* Get the monotonic time in timespec format: */ +extern void ktime_get_ts(struct timespec *ts); + +/* Get the real (wall-) time in timespec format: */ +#define ktime_get_real_ts(ts) getnstimeofday(ts) + +#endif /* !CONFIG_GENERIC_TIME */ + +#endif Index: linux-2.6.14/include/linux/ktimer.h =================================================================== --- /dev/null +++ linux-2.6.14/include/linux/ktimer.h @@ -0,0 +1,213 @@ +/* + * include/linux/ktimer.h + * + * ktimers - high-precision kernel timers + * + * Copyright(C) 2005, Thomas Gleixner + * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar + * + * data type definitions, declarations, prototypes + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#ifndef _LINUX_KTIMER_H +#define _LINUX_KTIMER_H + +#include +#include +#include +#include +#include + +/* + * Mode arguments of xxx_ktimer functions: + */ +enum ktimer_rearm { + KTIMER_ABS = 1, /* Time value is absolute */ + KTIMER_REL, /* Time value is relative to now */ + KTIMER_INCR, /* Time value is relative to previous expiry time */ + KTIMER_FORWARD, /* Timer is rearmed with value. Overruns accounted */ + KTIMER_REARM, /* Timer is rearmed with interval. Overruns accounted */ + KTIMER_RESTART, /* Timer is restarted with the stored expiry value */ + + /* + * Expiry must not be checked when the timer is started: + * (can be OR-ed with another above mode flag) + */ + KTIMER_NOCHECK = 0x10000, + /* + * Rounding is required when the time is set up. Thats an + * optimization for relative timers as we read current time + * in the enqueing code so we do not need to read is twice. + */ + KTIMER_ROUND = 0x20000, + + /* (used internally: no rearming) */ + KTIMER_NOREARM = 0 +}; + +/* + * Timer states: + */ +enum ktimer_state { + KTIMER_INACTIVE, /* Timer is inactive */ + KTIMER_PENDING, /* Timer is pending */ + KTIMER_EXPIRED, /* Timer is expired and queued in the rbtree */ + KTIMER_EXPIRED_NOQUEUE, /* Timer is expired and not queued in the rbtree */ +}; + +struct ktimer_base; + +/** + * struct ktimer - the basic ktimer structure + * + * @node: red black tree node for time ordered insertion + * @list: list head for easier access to the time ordered list, + * without walking the red black tree. + * @expires: the absolute expiry time in the ktimers internal + * representation. The time is related to the clock on + * which the timer is based. + * @expired: the absolute time when the timer expired. Used for + * simplifying return path calculations and for debugging + * purposes. 
+ * @interval: the timer interval for automatic rearming + * @overrun: the number of intervals missed when rearming a timer + * @state: state of the timer + * @function: timer expiry callback function + * @data: argument for the callback function + * @base: pointer to the timer base (per cpu and per clock) + * + * The ktimer structure must be initialized by init_ktimer_#CLOCKTYPE() + */ +struct ktimer { + struct rb_node node; + struct list_head list; + ktime_t expires; + ktime_t expired; + ktime_t interval; + int overrun; + enum ktimer_state state; + void (*function)(void *); + void *data; + struct ktimer_base *base; +}; + +/** + * struct ktimer_base - the timer base for a specific clock + * + * @index: clock type index for per_cpu support when moving a timer + * to a base on another cpu. + * @lock: lock protecting the base and associated timers + * @active: red black tree root node for the active timers + * @pending: list of pending timers for simple time ordered access + * @count: the number of active timers + * @resolution: the resolution of the clock, in nanoseconds + * @get_time: function to retrieve the current time of the clock + * @curr_timer: the timer which is executing a callback right now + * @wait: waitqueue to wait for a currently running timer + * @name: string identifier of the clock + */ +struct ktimer_base { + int index; + spinlock_t lock; + struct rb_root active; + struct list_head pending; + int count; + unsigned long resolution; + ktime_t (*get_time)(void); + struct ktimer *curr_timer; + wait_queue_head_t wait; +#ifdef CONFIG_HIGH_RES_TIMERS + struct list_head expired; + ktime_t (*getoffset)(void); + int (*reprogram)(struct ktimer *t, + struct ktimer_base *b, ktime_t n); +#endif + char *name; +}; + +#define KTIMER_POISON ((void *) 0x00100101) + +#ifdef CONFIG_HIGH_RES_TIMERS + +extern void ktimer_clock_notify(void); +extern void clock_was_set(void); +extern int ktimer_interrupt(void); + +/* + * The resolution of the clocks. The resolution value is returned in + * the clock_getres() system call to give application programmers an + * idea of the (in)accuracy of timers. Timer values are rounded up to + * this resolution values. + */ +#define KTIME_REALTIME_RES CONFIG_HIGH_RES_RESOLUTION +#define KTIME_MONOTONIC_RES CONFIG_HIGH_RES_RESOLUTION + +#else + +#define KTIME_REALTIME_RES KTIME_LOW_RES +#define KTIME_MONOTONIC_RES KTIME_LOW_RES + +/* + * clock_was_set() is a NOP for non- high-resolution systems. The + * time-sorted order guarantees that a timer does not expire early and + * is expired in the next softirq when the clock was advanced. 
+ */ +#define clock_was_set() do { } while (0) +#define ktimer_clock_notify() do { } while (0) + +static inline int ktimer_interrupt(void) +{ + return 0; +} + +#endif + +/* Exported timer functions: */ + +/* Initialize timers: */ +extern void ktimer_init(struct ktimer *timer); +extern void ktimer_init_real(struct ktimer *timer); + +/* Basic timer operations: */ +extern int ktimer_start(struct ktimer *timer, ktime_t *tim, int mode); +extern int ktimer_restart(struct ktimer *timer, ktime_t *tim, int mode); +extern int ktimer_cancel(struct ktimer *timer); +extern int ktimer_try_to_cancel(struct ktimer *timer); + +/* Query timers: */ +extern ktime_t ktimer_get_remtime(struct ktimer *timer); +extern ktime_t ktimer_get_expiry(struct ktimer *timer, ktime_t *now); +extern int ktimer_get_res(clockid_t which_clock, struct timespec *tp); +extern int ktimer_get_res_real(clockid_t which_clock, struct timespec *tp); + +static inline int ktimer_active(struct ktimer *timer) +{ + return timer->state != KTIMER_INACTIVE; +} + +/* Convert with rounding based on resolution of timer's clock: */ +extern ktime_t ktimer_round_timeval(struct ktimer *timer, struct timeval *tv); +extern ktime_t ktimer_round_timespec(struct ktimer *timer, struct timespec *ts); + +/* Precise sleep: */ +extern long ktimer_nanosleep(struct timespec *rqtp, + struct timespec __user *rmtp, int mode); +extern long ktimer_nanosleep_real(struct timespec *rqtp, + struct timespec __user *rmtp, int mode); + +#ifdef CONFIG_SMP +extern void wait_for_ktimer(struct ktimer *timer); +#else +# define wait_for_ktimer(t) do { } while (0) +#endif + +/* Soft interrupt function to run the ktimer queues: */ +extern void ktimer_run_queues(void); + +/* Bootup initialization: */ +extern void __init ktimers_init(void); + +#endif Index: linux-2.6.14/include/linux/posix-timers.h =================================================================== --- linux-2.6.14.orig/include/linux/posix-timers.h +++ linux-2.6.14/include/linux/posix-timers.h @@ -51,10 +51,9 @@ struct k_itimer { struct sigqueue *sigq; /* signal queue entry. 
*/ union { struct { - struct timer_list timer; - struct list_head abs_timer_entry; /* clock abs_timer_list */ - struct timespec wall_to_prev; /* wall_to_monotonic used when set */ - unsigned long incr; /* interval in jiffies */ + struct ktimer timer; + ktime_t incr; + int overrun; } real; struct cpu_timer_list cpu; struct { @@ -66,10 +65,6 @@ struct k_itimer { } it; }; -struct k_clock_abs { - struct list_head list; - spinlock_t lock; -}; struct k_clock { int res; /* in nano seconds */ int (*clock_getres) (clockid_t which_clock, struct timespec *tp); @@ -77,7 +72,7 @@ struct k_clock { int (*clock_set) (clockid_t which_clock, struct timespec * tp); int (*clock_get) (clockid_t which_clock, struct timespec * tp); int (*timer_create) (struct k_itimer *timer); - int (*nsleep) (clockid_t which_clock, int flags, struct timespec *); + int (*nsleep) (clockid_t which_clock, int flags, struct timespec *, struct timespec __user *); int (*timer_set) (struct k_itimer * timr, int flags, struct itimerspec * new_setting, struct itimerspec * old_setting); @@ -91,37 +86,104 @@ void register_posix_clock(clockid_t cloc /* Error handlers for timer_create, nanosleep and settime */ int do_posix_clock_notimer_create(struct k_itimer *timer); -int do_posix_clock_nonanosleep(clockid_t, int flags, struct timespec *); +int do_posix_clock_nonanosleep(clockid_t, int flags, struct timespec *, struct timespec __user *); int do_posix_clock_nosettime(clockid_t, struct timespec *tp); /* function to call to trigger timer event */ int posix_timer_event(struct k_itimer *timr, int si_private); -struct now_struct { - unsigned long jiffies; -}; - -#define posix_get_now(now) (now)->jiffies = jiffies; -#define posix_time_before(timer, now) \ - time_before((timer)->expires, (now)->jiffies) - -#define posix_bump_timer(timr, now) \ - do { \ - long delta, orun; \ - delta = now.jiffies - (timr)->it.real.timer.expires; \ - if (delta >= 0) { \ - orun = 1 + (delta / (timr)->it.real.incr); \ - (timr)->it.real.timer.expires += \ - orun * (timr)->it.real.incr; \ - (timr)->it_overrun += orun; \ - } \ - }while (0) +#if (BITS_PER_LONG < 64) +static inline ktime_t forward_posix_timer(struct k_itimer *t, ktime_t now) +{ + ktime_t delta = ktime_sub(now, t->it.real.timer.expires); + unsigned long orun = 1; + + if (ktime_cmp_val(delta, <, KTIME_ZERO)) + goto out; + + if (unlikely(ktime_cmp(delta, >, t->it.real.incr))) { + + int sft = 0; + u64 div, dclc, inc, dns; + + dclc = dns = ktime_to_ns(delta); + div = inc = ktime_to_ns(t->it.real.incr); + /* Make sure the divisor is less than 2^32 */ + while(div >> 32) { + sft++; + div >>= 1; + } + dclc >>= sft; + do_div(dclc, (unsigned long) div); + orun = (unsigned long) dclc; + if (likely(!(inc >> 32))) + dclc *= (unsigned long) inc; + else + dclc *= inc; + t->it.real.timer.expires = ktime_add_ns(t->it.real.timer.expires, + dclc); + } else { + t->it.real.timer.expires = ktime_add(t->it.real.timer.expires, + t->it.real.incr); + } + /* + * Here is the correction for exact. Also covers delta == incr + * which is the else clause above. 
+ */ + if (ktime_cmp(t->it.real.timer.expires, <=, now)) { + t->it.real.timer.expires = ktime_add(t->it.real.timer.expires, + t->it.real.incr); + orun++; + } + t->it_overrun += orun; + + out: + return ktime_sub(t->it.real.timer.expires, now); +} +#else +static inline ktime_t forward_posix_timer(struct k_itimer *t, ktime_t now) +{ + ktime_t delta = ktime_sub(now, t->it.real.timer.expires); + unsigned long orun = 1; + + if (ktime_cmp_val(delta, <, KTIME_ZERO)) + goto out; + + if (unlikely(ktime_cmp(delta, >, t->it.real.incr))) { + + u64 dns, inc; + + dns = ktime_to_ns(delta); + inc = ktime_to_ns(t->it.real.incr); + + orun = dns / inc; + t->it.real.timer.expires = ktime_add_ns(t->it.real.timer.expires, + orun * inc); + } else { + t->it.real.timer.expires = ktime_add(t->it.real.timer.expires, + t->it.real.incr); + } + /* + * Here is the correction for exact. Also covers delta == incr + * which is the else clause above. + */ + if (ktime_cmp(t->it.real.timer.expires, <=, now)) { + t->it.real.timer.expires = ktime_add(t->it.real.timer.expires, + t->it.real.incr); + orun++; + } + t->it_overrun += orun; + out: + return ktime_sub(t->it.real.timer.expires, now); +} +#endif int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *); int posix_cpu_clock_get(clockid_t which_clock, struct timespec *); int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp); int posix_cpu_timer_create(struct k_itimer *); -int posix_cpu_nsleep(clockid_t, int, struct timespec *); +int posix_cpu_nsleep(clockid_t, int, struct timespec *, + struct timespec __user *); int posix_cpu_timer_set(struct k_itimer *, int, struct itimerspec *, struct itimerspec *); int posix_cpu_timer_del(struct k_itimer *); Index: linux-2.6.14/include/linux/sched.h =================================================================== --- linux-2.6.14.orig/include/linux/sched.h +++ linux-2.6.14/include/linux/sched.h @@ -104,6 +104,7 @@ extern unsigned long nr_iowait(void); #include #include #include +#include #include @@ -358,8 +359,7 @@ struct signal_struct { struct list_head posix_timers; /* ITIMER_REAL timer for the process */ - struct timer_list real_timer; - unsigned long it_real_value, it_real_incr; + struct ktimer real_timer; /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */ cputime_t it_prof_expires, it_virt_expires; Index: linux-2.6.14/include/linux/time.h =================================================================== --- linux-2.6.14.orig/include/linux/time.h +++ linux-2.6.14/include/linux/time.h @@ -4,6 +4,7 @@ #include #ifdef __KERNEL__ +#include #include #endif @@ -27,6 +28,10 @@ struct timezone { #ifdef __KERNEL__ +/* timeofday base types */ +typedef s64 nsec_t; +typedef u64 cycle_t; + /* Parameters used to convert the timespec values */ #define MSEC_PER_SEC (1000L) #define USEC_PER_SEC (1000000L) @@ -38,38 +43,13 @@ static __inline__ int timespec_equal(str return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); } -/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. - * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 - * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. - * - * [For the Julian calendar (which was used in Russia before 1917, - * Britain & colonies before 1752, anywhere else before 1582, - * and is still in use by some communities) leave out the - * -year/100+year/400 terms, and add 10.] - * - * This algorithm was first published by Gauss (I think). 
- * - * WARNING: this function will overflow on 2106-02-07 06:28:16 on - * machines were long is 32-bit! (However, as time_t is signed, we - * will already get problems at other places on 2038-01-19 03:14:08) - */ -static inline unsigned long +#define timespec_valid(ts) \ +(((ts)->tv_sec >= 0) && (((unsigned) (ts)->tv_nsec) < NSEC_PER_SEC)) + +extern unsigned long mktime (unsigned int year, unsigned int mon, unsigned int day, unsigned int hour, - unsigned int min, unsigned int sec) -{ - if (0 >= (int) (mon -= 2)) { /* 1..12 -> 11,12,1..10 */ - mon += 12; /* Puts Feb last since it has leap day */ - year -= 1; - } - - return ((( - (unsigned long) (year/4 - year/100 + year/400 + 367*mon/12 + day) + - year*365 - 719499 - )*24 + hour /* now have hours */ - )*60 + min /* now have minutes */ - )*60 + sec; /* finally seconds */ -} + unsigned int min, unsigned int sec); extern struct timespec xtime; extern struct timespec wall_to_monotonic; @@ -80,6 +60,8 @@ static inline unsigned long get_seconds( return xtime.tv_sec; } +extern void set_normalized_timespec (struct timespec *ts, time_t sec, long nsec); + struct timespec current_kernel_time(void); #define CURRENT_TIME (current_kernel_time()) @@ -88,29 +70,64 @@ struct timespec current_kernel_time(void extern void do_gettimeofday(struct timeval *tv); extern int do_settimeofday(struct timespec *tv); extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz); -extern void clock_was_set(void); // call when ever the clock is set -extern int do_posix_clock_monotonic_gettime(struct timespec *tp); +extern void do_posix_clock_monotonic_gettime(struct timespec *ts); extern long do_utimes(char __user * filename, struct timeval * times); struct itimerval; extern int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue); extern int do_getitimer(int which, struct itimerval *value); -extern void getnstimeofday (struct timespec *tv); extern struct timespec timespec_trunc(struct timespec t, unsigned gran); -static inline void -set_normalized_timespec (struct timespec *ts, time_t sec, long nsec) + +/** + * timespec_to_ns - Convert timespec to nanoseconds + * @ts: pointer to the timespec variable to be converted + * + * Returns the scalar nanosecond representation of the timespec + * variable + */ +static inline nsec_t timespec_to_ns(struct timespec *ts) { - while (nsec > NSEC_PER_SEC) { - nsec -= NSEC_PER_SEC; - ++sec; + nsec_t res = (nsec_t) ts->tv_sec * NSEC_PER_SEC; + + return res + (nsec_t) ts->tv_nsec; +} + +/** + * timeval_to_ns - Convert timeval to nanoseconds + * @ts: pointer to the timeval variable to be converted + * + * Returns the scalar nanosecond representation of the timeval + * variable + */ +static inline nsec_t timeval_to_ns(struct timeval *tv) +{ + nsec_t res = (nsec_t) tv->tv_sec * NSEC_PER_SEC; + + return res + (nsec_t) tv->tv_usec * NSEC_PER_USEC; +} + +extern void ns_to_timespec(struct timespec *ts, nsec_t nsec); +extern void ns_to_timeval(struct timeval *tv, nsec_t nsec); + +static inline void normalize_timespec(struct timespec *ts) +{ + while ((unsigned long)ts->tv_nsec > NSEC_PER_SEC) { + ts->tv_nsec -= NSEC_PER_SEC; + ts->tv_sec++; } - while (nsec < 0) { - nsec += NSEC_PER_SEC; - --sec; +} + +static inline struct timespec timespec_add_ns(struct timespec a, nsec_t ns) +{ + while(ns > NSEC_PER_SEC) { + ns -= NSEC_PER_SEC; + a.tv_sec++; } - ts->tv_sec = sec; - ts->tv_nsec = nsec; + a.tv_nsec += ns; + normalize_timespec(&a); + + return a; } #endif /* __KERNEL__ */ @@ -145,23 +162,18 @@ struct itimerval { /* * 
The IDs of the various system clocks (for POSIX.1b interval timers). */ -#define CLOCK_REALTIME 0 -#define CLOCK_MONOTONIC 1 +#define CLOCK_REALTIME 0 +#define CLOCK_MONOTONIC 1 #define CLOCK_PROCESS_CPUTIME_ID 2 #define CLOCK_THREAD_CPUTIME_ID 3 -#define CLOCK_REALTIME_HR 4 -#define CLOCK_MONOTONIC_HR 5 /* * The IDs of various hardware clocks */ - - #define CLOCK_SGI_CYCLE 10 #define MAX_CLOCKS 16 -#define CLOCKS_MASK (CLOCK_REALTIME | CLOCK_MONOTONIC | \ - CLOCK_REALTIME_HR | CLOCK_MONOTONIC_HR) -#define CLOCKS_MONO (CLOCK_MONOTONIC & CLOCK_MONOTONIC_HR) +#define CLOCKS_MASK (CLOCK_REALTIME | CLOCK_MONOTONIC) +#define CLOCKS_MONO (CLOCK_MONOTONIC) /* * The various flags for setting POSIX.1b interval timers. Index: linux-2.6.14/include/linux/timer.h =================================================================== --- linux-2.6.14.orig/include/linux/timer.h +++ linux-2.6.14/include/linux/timer.h @@ -91,6 +91,6 @@ static inline void add_timer(struct time extern void init_timers(void); extern void run_local_timers(void); -extern void it_real_fn(unsigned long); +extern void it_real_fn(void *); #endif Index: linux-2.6.14/init/main.c =================================================================== --- linux-2.6.14.orig/init/main.c +++ linux-2.6.14/init/main.c @@ -47,6 +47,8 @@ #include #include #include +#include +#include #include #include @@ -484,8 +486,11 @@ asmlinkage void __init start_kernel(void rcu_init(); init_IRQ(); pidhash_init(); + init_clockevents(); init_timers(); + ktimers_init(); softirq_init(); + timeofday_init(); time_init(); /* Index: linux-2.6.14/kernel/Makefile =================================================================== --- linux-2.6.14.orig/kernel/Makefile +++ linux-2.6.14/kernel/Makefile @@ -7,8 +7,10 @@ obj-y = sched.o fork.o exec_domain.o sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o intermodule.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o \ + ktimers.o +obj-$(CONFIG_GENERIC_TIME) += time/ obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o Index: linux-2.6.14/kernel/exit.c =================================================================== --- linux-2.6.14.orig/kernel/exit.c +++ linux-2.6.14/kernel/exit.c @@ -842,7 +842,7 @@ fastcall NORET_TYPE void do_exit(long co update_mem_hiwater(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { - del_timer_sync(&tsk->signal->real_timer); + ktimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); acct_process(code); } Index: linux-2.6.14/kernel/fork.c =================================================================== --- linux-2.6.14.orig/kernel/fork.c +++ linux-2.6.14/kernel/fork.c @@ -804,10 +804,9 @@ static inline int copy_signal(unsigned l init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); - sig->it_real_value = sig->it_real_incr = 0; + ktimer_init(&sig->real_timer); sig->real_timer.function = it_real_fn; - sig->real_timer.data = (unsigned long) tsk; - init_timer(&sig->real_timer); + sig->real_timer.data = tsk; sig->it_virt_expires = cputime_zero; sig->it_virt_incr = cputime_zero; Index: linux-2.6.14/kernel/itimer.c =================================================================== --- linux-2.6.14.orig/kernel/itimer.c +++ linux-2.6.14/kernel/itimer.c @@ -12,36 +12,49 @@ #include #include #include +#include #include -static unsigned long 
it_real_value(struct signal_struct *sig) +/** + * itimer_get_remtime - get remaining time for the timer + * + * @timer: the timer to read + * @fake: a pending, but expired timer returns fake (itimers kludge) + * + * Returns the delta between the expiry time and now, which can be + * less than zero or the fake value described above. + */ +static ktime_t itimer_get_remtime(struct ktimer *timer, long fake) { - unsigned long val = 0; - if (timer_pending(&sig->real_timer)) { - val = sig->real_timer.expires - jiffies; - - /* look out for negative/zero itimer.. */ - if ((long) val <= 0) - val = 1; - } - return val; + ktime_t rem = ktimer_get_remtime(timer); + + /* + * Racy but safe: if the itimer expires after the above + * ktimer_get_remtime() call but before this condition + * then we return KTIMER_ZERO - which is correct. + */ + if (ktimer_active(timer)) { + if (ktime_cmp_val(rem, <=, KTIME_ZERO)) + rem = ktime_set(0, fake); + } else + ktime_set_scalar(rem, KTIME_ZERO); + + return rem; } int do_getitimer(int which, struct itimerval *value) { struct task_struct *tsk = current; - unsigned long interval, val; + ktime_t interval, val; cputime_t cinterval, cval; switch (which) { case ITIMER_REAL: - spin_lock_irq(&tsk->sighand->siglock); - interval = tsk->signal->it_real_incr; - val = it_real_value(tsk->signal); - spin_unlock_irq(&tsk->sighand->siglock); - jiffies_to_timeval(val, &value->it_value); - jiffies_to_timeval(interval, &value->it_interval); + interval = tsk->signal->real_timer.interval; + val = itimer_get_remtime(&tsk->signal->real_timer, NSEC_PER_USEC); + ktime_to_timeval(&value->it_value, val); + ktime_to_timeval(&value->it_interval, interval); break; case ITIMER_VIRTUAL: read_lock(&tasklist_lock); @@ -113,59 +126,36 @@ asmlinkage long sys_getitimer(int which, } -void it_real_fn(unsigned long __data) +/* + * The timer is automagically restarted, when interval != 0 + */ +void it_real_fn(void *data) { - struct task_struct * p = (struct task_struct *) __data; - unsigned long inc = p->signal->it_real_incr; - - send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); - - /* - * Now restart the timer if necessary. We don't need any locking - * here because do_setitimer makes sure we have finished running - * before it touches anything. - * Note, we KNOW we are (or should be) at a jiffie edge here so - * we don't need the +1 stuff. Also, we want to use the prior - * expire value so as to not "slip" a jiffie if we are late. - * Deal with requesting a time prior to "now" here rather than - * in add_timer. 
- */ - if (!inc) - return; - while (time_before_eq(p->signal->real_timer.expires, jiffies)) - p->signal->real_timer.expires += inc; - add_timer(&p->signal->real_timer); + send_group_sig_info(SIGALRM, SEND_SIG_PRIV, data); } int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { struct task_struct *tsk = current; - unsigned long val, interval, expires; + struct ktimer *timer; + ktime_t expires; cputime_t cval, cinterval, nval, ninterval; switch (which) { case ITIMER_REAL: -again: - spin_lock_irq(&tsk->sighand->siglock); - interval = tsk->signal->it_real_incr; - val = it_real_value(tsk->signal); - /* We are sharing ->siglock with it_real_fn() */ - if (try_to_del_timer_sync(&tsk->signal->real_timer) < 0) { - spin_unlock_irq(&tsk->sighand->siglock); - goto again; - } - tsk->signal->it_real_incr = - timeval_to_jiffies(&value->it_interval); - expires = timeval_to_jiffies(&value->it_value); - if (expires) - mod_timer(&tsk->signal->real_timer, - jiffies + 1 + expires); - spin_unlock_irq(&tsk->sighand->siglock); + timer = &tsk->signal->real_timer; + ktimer_cancel(timer); if (ovalue) { - jiffies_to_timeval(val, &ovalue->it_value); - jiffies_to_timeval(interval, - &ovalue->it_interval); - } + ktime_to_timeval(&ovalue->it_value, + itimer_get_remtime(timer, NSEC_PER_USEC)); + ktime_to_timeval(&ovalue->it_interval, timer->interval); + } + timer->interval = ktimer_round_timeval(timer, + &value->it_interval); + expires = timeval_to_ktime(value->it_value); + if (ktime_cmp_val(expires, != , KTIME_ZERO)) + ktimer_restart(timer, &expires, + KTIMER_REL | KTIMER_NOCHECK | KTIMER_ROUND); break; case ITIMER_VIRTUAL: nval = timeval_to_cputime(&value->it_value); Index: linux-2.6.14/kernel/ktimers.c =================================================================== --- /dev/null +++ linux-2.6.14/kernel/ktimers.c @@ -0,0 +1,1477 @@ +/* + * linux/kernel/ktimers.c + * + * Copyright(C) 2005, Thomas Gleixner + * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar + * + * High-precision kernel timers + * + * In contrast to the low-resolution timeout API implemented in + * kernel/timer.c, ktimers provide finer resolution and accuracy + * depending on system configuration and capabilities. 
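The itimer conversion just above already shows the intended calling pattern; as a compact illustration (not code from the patch: the helper names and the 100 ms value are invented here, and it assumes the ktimer interfaces declared earlier), a kernel-internal user would drive the API roughly like this:

	/* Illustrative sketch only: arm a relative ktimer on the monotonic clock and block until it fires. */
	static void example_wakeup(void *data)
	{
		complete(data);			/* runs from the ktimer expiry path */
	}

	static void example_delay_100ms(void)
	{
		DECLARE_COMPLETION(done);
		struct ktimer t;
		ktime_t expires = ktime_set(0, 100 * 1000 * 1000);	/* 100 ms */

		ktimer_init(&t);		/* binds the timer to the monotonic base */
		t.function = example_wakeup;
		t.data = &done;

		/* relative expiry, rounded up to the clock base resolution */
		ktimer_start(&t, &expires, KTIMER_REL | KTIMER_ROUND);
		wait_for_completion(&done);
		ktimer_cancel(&t);		/* safe even after the callback has run */
	}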
+ * + * These timers are currently used for: + * - itimers + * - POSIX timers + * - nanosleep + * - precise in-kernel timing + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * Credits: + * based on kernel/timer.c + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_HIGH_RES_TIMERS +static int hrtimer_common_reprogram(struct ktimer *timer, + struct ktimer_base *base, ktime_t now); +#endif + +/* + * The timer bases: + */ + +#define MAX_KTIMER_BASES 2 + +static DEFINE_PER_CPU(struct ktimer_base, ktimer_bases[MAX_KTIMER_BASES]) = +{ + { + .index = CLOCK_REALTIME, + .name = "Realtime", + .get_time = &ktime_get_real, + .resolution = KTIME_REALTIME_RES, +#ifdef CONFIG_HIGH_RES_TIMERS + .reprogram = &hrtimer_common_reprogram, + .getoffset = &get_realtime_offset, +#endif + }, + { + .index = CLOCK_MONOTONIC, + .name = "Monotonic", + .get_time = &ktime_get, + .resolution = KTIME_MONOTONIC_RES, +#ifdef CONFIG_HIGH_RES_TIMERS + .reprogram = &hrtimer_common_reprogram, +#endif + }, +}; + +#ifndef CONFIG_GENERIC_TIME + +/** + * ktime_get - get the monotonic time in ktime_t format + * + * returns the time in ktime_t format + */ +ktime_t ktime_get(void) +{ + struct timespec now; + + ktime_get_ts(&now); + + return timespec_to_ktime(now); +} + +EXPORT_SYMBOL_GPL(ktime_get); + +/** + * ktime_get_real - get the real (wall-) time in ktime_t format + * + * returns the time in ktime_t format + */ +ktime_t ktime_get_real(void) +{ + struct timespec now; + + getnstimeofday(&now); + + return timespec_to_ktime(now); +} + +EXPORT_SYMBOL_GPL(ktime_get_real); + +/** + * ktime_get_ts - get the monotonic clock in timespec format + * + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by ts. + */ +void ktime_get_ts(struct timespec *ts) +{ + struct timespec tomono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + getnstimeofday(ts); + tomono = wall_to_monotonic; + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, + ts->tv_nsec + tomono.tv_nsec); +} +#endif + +/* + * Functions and macros which are different for UP/SMP systems are kept in a + * single place + */ +#ifdef CONFIG_SMP + +#define set_curr_timer(b, t) (b)->curr_timer = (t) +#define wake_up_timer_waiters(b) wake_up(&(b)->wait) + +/** + * wait_for_ktimer - Wait for a running ktimer + * + * @timer: timer to wait for + * + * The function waits in case the timers callback function is + * currently executed on the waitqueue of the timer base. The + * waitqueue is woken up after the timer callback function has + * finished execution. + */ +void wait_for_ktimer(struct ktimer *timer) +{ + struct ktimer_base *base = timer->base; + + if (base) + wait_event(base->wait, + base->curr_timer != timer); +} + +/* + * We are using hashed locking: holding per_cpu(ktimer_bases)[n].lock + * means that all timers which are tied to this base via timer->base are + * locked, and the base itself is locked too. + * + * So __run_timers/migrate_timers can safely modify all timers which could + * be found on the lists/queues. + * + * When the timer's base is locked, and the timer removed from list, it is + * possible to set timer->base = NULL and drop the lock: the timer remains + * locked. 
+ */ +static struct ktimer_base *lock_ktimer_base(struct ktimer *timer, + unsigned long *flags) +{ + struct ktimer_base *base; + + for (;;) { + base = timer->base; + if (likely(base != NULL)) { + spin_lock_irqsave(&base->lock, *flags); + if (likely(base == timer->base)) + return base; + /* The timer has migrated to another CPU */ + spin_unlock_irqrestore(&base->lock, *flags); + } + cpu_relax(); + } +} + +/* + * Switch the timer base to the current CPU when possible. + */ +static inline struct ktimer_base * +switch_ktimer_base(struct ktimer *timer, struct ktimer_base *base) +{ + struct ktimer_base *new_base; + + new_base = &__get_cpu_var(ktimer_bases[base->index]); + + if (base != new_base) { + /* + * We are trying to schedule the timer on the local CPU. + * However we can't change timer's base while it is running, + * so we keep it on the same CPU. No hassle vs. reprogramming + * the event source in the high resolution case. The softirq + * code will take care of this when the timer function has + * completed. There is no conflict as we hold the lock until + * the timer is enqueued. + */ + if (unlikely(base->curr_timer == timer)) + return base; + + /* See the comment in lock_timer_base() */ + timer->base = NULL; + spin_unlock(&base->lock); + spin_lock(&new_base->lock); + timer->base = new_base; + } + return new_base; +} + +/* + * Get the timer base unlocked + * + * Take care of timer->base = NULL in switch_ktimer_base ! + */ +static inline struct ktimer_base * +get_ktimer_base_unlocked(struct ktimer *timer) +{ + struct ktimer_base *base; + + while (!(base = timer->base)) + cpu_relax(); + + return base; +} + +#else /* CONFIG_SMP */ + +#define set_curr_timer(b, t) do { } while (0) +#define wake_up_timer_waiters(b) do { } while (0) + +static inline struct ktimer_base * +lock_ktimer_base(struct ktimer *timer, unsigned long *flags) +{ + struct ktimer_base *base = timer->base; + + spin_lock_irqsave(&base->lock, *flags); + + return base; +} + +#define switch_ktimer_base(t, b) (b) +#define get_ktimer_base_unlocked(t) (t)->base + +#endif /* !CONFIG_SMP */ + +/* High resolution timer related functions */ +#ifdef CONFIG_HIGH_RES_TIMERS + +#define ktimer_hres_active (__get_cpu_var(ktimer_hres).active) + +struct ktimer_hres { + ktime_t expires_next; + ktime_t next_tick; + ktime_t tick_incr; + int active; + int dotick; + unsigned long check_clocks; +}; + +DEFINE_PER_CPU(struct ktimer_hres, ktimer_hres); + +/* + * Shared reprogramming for clock_realtime and clock_monotonic + * + * When a new expires first timer is enqueued, we have + * to check, whether it expires earlier than the timer + * for which the hrt time source was armed. 
+ * + * Called with interrupts disabled and base lock held + */ +static int hrtimer_common_reprogram(struct ktimer *timer, + struct ktimer_base *base, ktime_t now) +{ + ktime_t *expires_next = &__get_cpu_var(ktimer_hres).expires_next; + ktime_t expires = timer->expires; + int res; + + if (base->getoffset) + expires = ktime_sub(expires, base->getoffset()); + + if (ktime_cmp(expires, >= ,*expires_next)) + return 0; + + res = clockevents_set_next_event(expires, now); + if (!res) + *expires_next = expires; + return res; +} + +/* + * High resolution timer interrupt + * Called with interrupts disabled + */ +int ktimer_interrupt(void) +{ + struct ktimer_base *base; + ktime_t expires_next, now; + int i, raise = 0, ret = 0; + int cpu = smp_processor_id(); + struct ktimer_hres *hres = &per_cpu(ktimer_hres, cpu); + + /* As long as we did not switch over to high resolution mode + * we expect, that the event source is running in periodic + * mode when it is a source serving other (tick based) + * functionality than next event + * + */ + if (!hres->active) + return 1; + + now = ktime_get(); + + if (hres->dotick) { + while (ktime_cmp(now, >= , hres->next_tick)) { + hres->next_tick = ktime_add(hres->next_tick, + hres->tick_incr); + ret++; + } + expires_next = hres->next_tick; + } else + ktime_set_scalar(expires_next, KTIME_MAX); + + base = per_cpu(ktimer_bases, cpu); + + for (i = 0; i < MAX_KTIMER_BASES; i++) { + ktime_t basenow; + DEFINE_KTIME(offset); + + spin_lock(&base->lock); + + if (list_empty(&base->pending)) { + spin_unlock(&base->lock); + base++; + continue; + } + + if (base->getoffset) { + offset = base->getoffset(); + basenow = ktime_add(now, offset); + } else { + basenow = now; + } + + while (!list_empty(&base->pending)) { + struct ktimer *timer = list_entry(base->pending.next, + struct ktimer, list); + + if (ktime_cmp(basenow, < , timer->expires)) { + ktime_t expires; + + expires = ktime_sub(timer->expires, offset); + if (ktime_cmp(expires, < , expires_next)) + expires_next = expires; + break; + } + timer->expired = basenow; + list_del(&timer->list); + list_add_tail(&timer->list, &base->expired); + timer->state = KTIMER_EXPIRED; + raise = 1; + } + spin_unlock(&base->lock); + base++; + } + + hres->expires_next = expires_next; + + /* Reprogramming necessary ? */ + if (ktime_cmp_val(expires_next, !=, KTIME_MAX)) + clockevents_set_next_event(expires_next, now); + + /* Raise softirq ? 
*/ + if (raise) + raise_softirq(KTIMER_SOFTIRQ); + + return ret; +} + +/* + * Retrigger next event is called after clock was set + */ +void retrigger_next_event(void *arg) +{ + ktime_t expires_next, now; + int i, cpu = smp_processor_id(); + struct ktimer_base *base = per_cpu(ktimer_bases, cpu); + struct ktimer_hres *hres = &per_cpu(ktimer_hres, cpu); + + now = ktime_get(); + + if (hres->dotick) + expires_next = hres->next_tick; + else + ktime_set_scalar(expires_next, KTIME_MAX); + + for (i = 0; i < MAX_KTIMER_BASES; i++) { + ktime_t basenow; + DEFINE_KTIME(offset); + struct ktimer *timer; + + spin_lock(&base->lock); + + if (list_empty(&base->pending)) { + spin_unlock(&base->lock); + base++; + continue; + } + + if (base->getoffset) { + offset = base->getoffset(); + basenow = ktime_add(now, offset); + } else { + basenow = now; + } + timer = list_entry(base->pending.next, struct ktimer, list); + + if (ktime_cmp(basenow, < , timer->expires)) { + ktime_t expires; + + expires = ktime_sub(timer->expires, offset); + if (ktime_cmp(expires, < , expires_next)) + expires_next = expires; + } + spin_unlock(&base->lock); + base++; + } + + hres->expires_next = expires_next; + + /* Reprogramming necessary ? */ + if (ktime_cmp_val(expires_next, !=, KTIME_MAX)) + clockevents_set_next_event(expires_next, now); +} + +/* + * Clock realtime was set + * + * Change the offset of the realtime clock vs. the monotonic + * clock. Called with xtime lock held ! + * + * We might have to reprogram the high resolution timer interrupt. On + * SMP we call the architecture specific code to retrigger _all_ high + * resolution timer interrupts. On UP we just disable interrupts and + * call the high resolution interrupt code. + */ +void clock_was_set(void) +{ + preempt_disable(); + local_irq_disable(); + + if (ktimer_hres_active) { + retrigger_next_event(NULL); + local_irq_enable(); + + if (smp_call_function(retrigger_next_event, NULL, 1, 1)) + BUG(); + } else + local_irq_enable(); + preempt_enable(); +} + +/*** + * ktimer_clock_notify - A clock source or a clock event has been installed + * + * Notify the per cpu softirqs to recheck the clock sources and events + */ +void ktimer_clock_notify(void) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) + set_bit(0, &per_cpu(ktimer_hres, i).check_clocks); +} + +/* + * A change in the clock source or clock events was detected. + * Check the clock source and the events, whether we can switch to + * high resolution mode or not. 
+ * + * TODO: Handle the removal of clock sources / events + */ +static void ktimer_check_clocks(void) +{ + struct ktimer_hres *hres = &__get_cpu_var(ktimer_hres); + unsigned long flags; + int dotick; + + if (!test_and_clear_bit(0, &hres->check_clocks)) + return; + + if (!timeofday_is_continuous()) + return; + + if (!(dotick = clockevents_next_event_available())) + return; + + local_irq_save(flags); + clockevents_init_next_event(); + hres->active = 1; + if (dotick == CLOCK_EVT_SCHEDTICK) { + struct ktimer helper; + struct timespec tsnow; + ktime_t now; + + /* Adjust to resolution */ + ktimer_init(&helper); + ktime_get_ts(&tsnow); + now = ktimer_round_timespec(&helper, &tsnow); + hres->tick_incr = ktime_set(0, NSEC_PER_SEC/HZ); + hres->next_tick = ktime_add(now, hres->tick_incr); + hres->dotick = 1; + } else + hres->dotick = 0; + + /* "Retrigger" the interrupt to get things going */ + retrigger_next_event(NULL); + local_irq_restore(flags); + printk(KERN_INFO "Ktimers: Switched to high resolution mode CPU %d\n", + smp_processor_id()); +} + +/* + * For HRT we move expired timers directly to the expired list and set + * the status to KTIMER_EXPIRED_NOQUEUE + */ +static inline int hres_enqueue_expired(struct ktimer *timer, + struct ktimer_base *base, ktime_t now) +{ + timer->expired = now; + list_add_tail(&timer->list, &base->expired); + timer->state = KTIMER_EXPIRED_NOQUEUE; + base->count++; + raise_softirq(KTIMER_SOFTIRQ); + return 1; +} + +static inline void +hres_requeue_expired(struct ktimer *timer, struct ktimer_base *base) +{ + timer->expired = timer->expires; + list_del(&timer->list); + list_add_tail(&timer->list, &base->expired); + timer->state = KTIMER_EXPIRED; + raise_softirq(KTIMER_SOFTIRQ); +} + +#else +# define ktimer_hres_active 0 +# define hres_enqueue_expired(t,b,n) 0 +# define ktimer_check_clocks() do { } while (0) +#endif + +/* + * Functions for the union type storage format of ktime_t which are + * too large for inlining: + */ +#if (BITS_PER_LONG < 64) + +#ifndef CONFIG_KTIME_SCALAR +/** + * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable + * + * @kt: addend + * @nsec: the scalar nsec value to add + * + * Returns the sum of kt and nsec in ktime_t format + */ +ktime_t ktime_add_ns(ktime_t kt, u64 nsec) +{ + ktime_t tmp; + + if (likely(nsec < NSEC_PER_SEC)) { + tmp.tv64 = nsec; + } else { + unsigned long rem = do_div(nsec, NSEC_PER_SEC); + + tmp = ktime_set((long)nsec, rem); + } + + return ktime_add(kt, tmp); +} + +#else + +/** + * ktime_modulo - Calc ktime_t modulo div + * + * @kt: dividend + * @div. divisor + * + * Return ktime_t modulo div. + * + * div is less than NSEC_PER_SEC and (NSEC_PER_SEC % div) = 0 ! + */ +static unsigned long ktime_modulo(ktime_t kt, unsigned long div) +{ + return do_div(kt, div); +} + +#endif +#endif + +/* + * Counterpart to lock_timer_base above. + */ +static inline +void unlock_ktimer_base(struct ktimer *timer, unsigned long *flags) +{ + spin_unlock_irqrestore(&timer->base->lock, *flags); +} + +/** + * ktimer_round_timespec - convert timespec to ktime_t with resolution + * adjustment + * + * @timer: ktimer to retrieve the base + * @ts: pointer to the timespec value to be converted + * + * Returns the resolution adjusted ktime_t representation of the + * timespec. + * + * Note: We can access base without locking here, as ktimers can + * migrate between CPUs but can not be moved from one clock source to + * another. The clock source binding is set at init_ktimer_XXX time. 
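To make the rounding done by ktimer_round_timespec() below concrete, a worked example with made-up numbers (the real resolution comes from the timer's clock base):

	/*
	 * Illustration only: assume base->resolution = 1000000 (1 ms) and a
	 * requested sleep of 2.3 ms.
	 *
	 *	ts  = { .tv_sec = 0, .tv_nsec = 2300000 }
	 *	rem = 2300000 % 1000000 = 300000
	 *	t   = 2300000 + (1000000 - 300000) = 3000000 ns
	 *
	 * The value is rounded up to the next resolution boundary; a value
	 * already on a boundary (rem == 0) is returned unchanged.
	 */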
+ */ +ktime_t ktimer_round_timespec(struct ktimer *timer, struct timespec *ts) +{ + struct ktimer_base *base = get_ktimer_base_unlocked(timer); + long rem = ts->tv_nsec % base->resolution; + ktime_t t; + + t = ktime_set(ts->tv_sec, ts->tv_nsec); + + /* Check, if the value has to be rounded */ + if (rem) + t = ktime_add_ns(t, base->resolution - rem); + + return t; +} + +/** + * ktimer_round_timeval - convert timeval to ktime_t with resolution + * adjustment + * + * @timer: ktimer to retrieve the base + * @tv: pointer to the timeval value to be converted + * + * Returns the resolution adjusted ktime_t representation of the + * timeval. + */ +ktime_t ktimer_round_timeval(struct ktimer *timer, struct timeval *tv) +{ + struct timespec ts; + + ts.tv_sec = tv->tv_sec; + ts.tv_nsec = tv->tv_usec * NSEC_PER_USEC; + + return ktimer_round_timespec(timer, &ts); +} + +/* + * enqueue_ktimer - internal function to (re)start a timer + * + * The timer is inserted in expiry order. Insertion into the + * red black tree is O(log(n)). Must hold the base lock. + */ +static int enqueue_ktimer(struct ktimer *timer, struct ktimer_base *base, + ktime_t *tim, int mode) +{ + struct rb_node **link = &base->active.rb_node; + struct list_head *prev = &base->pending; + struct rb_node *parent = NULL; + struct ktimer *entry; + ktime_t now; + + /* Get current time */ + now = base->get_time(); + + /* + * Calculate the absolute expiry time based on the + * timer expiry mode: + */ + switch (mode & ~(KTIMER_NOCHECK | KTIMER_ROUND)) { + + case KTIMER_ABS: + timer->expires = *tim; + break; + + case KTIMER_REL: + timer->expires = ktime_add(now, *tim); + break; + + case KTIMER_INCR: + timer->expires = ktime_add(timer->expires, *tim); + break; + + case KTIMER_FORWARD: + while ktime_cmp(timer->expires, <= , now) { + timer->expires = ktime_add(timer->expires, *tim); + timer->overrun++; + } + goto nocheck; + + case KTIMER_REARM: + while ktime_cmp(timer->expires, <= , now) { + timer->expires = ktime_add(timer->expires, + timer->interval); + timer->overrun++; + } + goto nocheck; + + case KTIMER_RESTART: + break; + + default: + /* illegal mode */ + BUG(); + } + + /* + * Rounding is requested for one shot timers and the first + * event of interval timers. It's done here, so we don't + * have to read the current time twice for relative timers. + */ + if (mode & KTIMER_ROUND) { + unsigned long rem; + + rem = ktime_modulo(timer->expires, base->resolution); + if (rem) + timer->expires = ktime_add_ns(timer->expires, + base->resolution - rem); + } + + /* Expiry time in the past: */ + if (unlikely(ktime_cmp(timer->expires, <=, now))) { + timer->expired = now; + /* The caller takes care of expiry */ + if (!(mode & KTIMER_NOCHECK)) + return -1; + if (hres_enqueue_expired(timer, base, now)) + return 0; + } + nocheck: + + /* + * Find the right place in the rbtree: + */ + while (*link) { + parent = *link; + entry = rb_entry(parent, struct ktimer, node); + /* + * We dont care about collisions. Nodes with + * the same expiry time stay together. 
+ */ + if (ktime_cmp(timer->expires, <, entry->expires)) + link = &(*link)->rb_left; + else { + link = &(*link)->rb_right; + prev = &entry->list; + } + } + + /* + * Insert the timer to the rbtree and to the sorted list: + */ + rb_link_node(&timer->node, parent, link); + rb_insert_color(&timer->node, &base->active); + if (ktimer_hres_active && prev != &base->pending) { + entry = list_entry(prev, struct ktimer, list); + if (entry->state != KTIMER_PENDING) + prev = &base->pending; + } + list_add(&timer->list, prev); + + timer->state = KTIMER_PENDING; + base->count++; + +#ifdef CONFIG_HIGH_RES_TIMERS + if (ktimer_hres_active && + base->pending.next == &timer->list && + base->reprogram && + base->reprogram(timer, base, now)) + hres_requeue_expired(timer, base); +#endif + return 0; +} + +/* + * __remove_ktimer - internal function to remove a timer + * + * The function also allows automatic rearming for interval timers. + * Must hold the base lock. + */ +static void +__remove_ktimer(struct ktimer *timer, struct ktimer_base *base, + enum ktimer_rearm rearm) +{ + /* + * Remove the timer from the sorted list and from the rbtree: + */ + list_del(&timer->list); + if (timer->state != KTIMER_EXPIRED_NOQUEUE) + rb_erase(&timer->node, &base->active); + timer->node.rb_parent = KTIMER_POISON; + + timer->state = KTIMER_INACTIVE; + base->count--; + BUG_ON(base->count < 0); + + /* Auto rearm the timer ? */ + if (rearm && ktime_cmp_val(timer->interval, !=, KTIME_ZERO)) + enqueue_ktimer(timer, base, NULL, KTIMER_REARM); +} + +/* + * remove ktimer, called with base lock held + */ +static inline int remove_ktimer(struct ktimer *timer, struct ktimer_base *base) +{ + if (ktimer_active(timer)) { + __remove_ktimer(timer, base, KTIMER_NOREARM); + return 1; + } + return 0; +} + +/* + * Internal function to (re)start a timer. + */ +static int +internal_restart_ktimer(struct ktimer *timer, ktime_t *tim, int mode) +{ + struct ktimer_base *base, *new_base; + unsigned long flags; + int ret; + + BUG_ON(!timer->function); + + base = lock_ktimer_base(timer, &flags); + + /* Remove an active timer from the queue */ + ret = remove_ktimer(timer, base); + + /* Switch the timer base, if necessary */ + new_base = switch_ktimer_base(timer, base); + + /* + * When the new timer setting is already expired, + * let the calling code deal with it. 
+ */ + if (enqueue_ktimer(timer, new_base, tim, mode)) + ret = -1; + + unlock_ktimer_base(timer, &flags); + + return ret; +} + +/** + * ktimer_start - start a timer on the current CPU + * + * @timer: the timer to be added + * @tim: expiry time (optional, if not set in the timer) + * @mode: timer setup mode + * + * Returns: + * 0 on success + * -1 when the new time setting is already expired + */ +int ktimer_start(struct ktimer *timer, ktime_t *tim, int mode) +{ + BUG_ON(ktimer_active(timer)); + + return internal_restart_ktimer(timer, tim, mode); +} + +EXPORT_SYMBOL_GPL(ktimer_start); + +/** + * ktimer_restart - modify a running timer + * + * @timer: the timer to be modified + * @tim: expiry time (required) + * @mode: timer setup mode + * + * Returns: + * 0 when the timer was not active + * 1 when the timer was active + * -1 when the new time setting is already expired + */ +int ktimer_restart(struct ktimer *timer, ktime_t *tim, int mode) +{ + BUG_ON(!tim); + + return internal_restart_ktimer(timer, tim, mode); +} + +EXPORT_SYMBOL_GPL(ktimer_restart); + +/** + * ktimer_try_to_cancel - try to deactivate a timer + * + * @timer: ktimer to stop + * + * Returns: + * 0 when the timer was not active + * 1 when the timer was active + * -1 when the timer is currently excuting the callback function and + * can not be stopped + */ +int ktimer_try_to_cancel(struct ktimer *timer) +{ + struct ktimer_base *base; + unsigned long flags; + int ret = -1; + + base = lock_ktimer_base(timer, &flags); + + if (base->curr_timer != timer) { + ret = remove_ktimer(timer, base); + if (ret) + timer->expired = base->get_time(); + } + + unlock_ktimer_base(timer, &flags); + + return ret; + +} + +EXPORT_SYMBOL_GPL(ktimer_try_to_cancel); + +/** + * ktimer_cancel - cancel a timer and wait for the handler to finish. + * + * @timer: the timer to be cancelled + * + * Returns: + * 0 when the timer was not active + * 1 when the timer was active + */ +int ktimer_cancel(struct ktimer *timer) +{ + for (;;) { + int ret = ktimer_try_to_cancel(timer); + + if (ret >= 0) + return ret; + wait_for_ktimer(timer); + } +} + +EXPORT_SYMBOL_GPL(ktimer_cancel); + +/** + * ktimer_get_remtime - get remaining time for the timer + * + * @timer: the timer to read + * + * Returns the delta between the expiry time and now, which can be + * less than zero. 
+ */ +ktime_t ktimer_get_remtime(struct ktimer *timer) +{ + struct ktimer_base *base; + unsigned long flags; + ktime_t rem; + + base = lock_ktimer_base(timer, &flags); + rem = ktime_sub(timer->expires, base->get_time()); + unlock_ktimer_base(timer, &flags); + + return rem; +} + +/** + * ktimer_get_expiry - get expiry time for the timer + * + * @timer: the timer to read + * @now: if != NULL then store current base->time into it + */ +ktime_t ktimer_get_expiry(struct ktimer *timer, ktime_t *now) +{ + struct ktimer_base *base; + unsigned long flags; + ktime_t expiry; + + base = lock_ktimer_base(timer, &flags); + expiry = timer->expires; + if (now) + *now = base->get_time(); + unlock_ktimer_base(timer, &flags); + + return expiry; +} + +/* + * Functions related to clock sources + */ + +static inline void ktimer_common_init(struct ktimer *timer) +{ + memset(timer, 0, sizeof(struct ktimer)); + timer->node.rb_parent = KTIMER_POISON; +} + +/** + * ktimer_init - initialize a timer to the monotonic clock + * + * @timer: the timer to be initialized + */ +void ktimer_init(struct ktimer *timer) +{ + struct ktimer_base *bases; + + ktimer_common_init(timer); + bases = per_cpu(ktimer_bases, raw_smp_processor_id()); + timer->base = &bases[CLOCK_MONOTONIC]; +} + +EXPORT_SYMBOL_GPL(ktimer_init); + +/** + * ktimer_init_real - initialize a timer to the real (wall-) clock + * + * @timer: the timer to be initialized + */ +void ktimer_init_real(struct ktimer *timer) +{ + struct ktimer_base *bases; + + ktimer_common_init(timer); + bases = per_cpu(ktimer_bases, raw_smp_processor_id()); + timer->base = &bases[CLOCK_REALTIME]; +} + +EXPORT_SYMBOL_GPL(ktimer_init_real); + +/** + * ktimer_get_res - get the monotonic timer resolution + * + * @which_clock: unused parameter for compability with the posix timer code + * @tp: pointer to timespec variable to store the resolution + * + * Store the resolution of clock monotonic in the variable pointed to + * by tp. + */ +int ktimer_get_res(clockid_t which_clock, struct timespec *tp) +{ + struct ktimer_base *bases; + + tp->tv_sec = 0; + bases = per_cpu(ktimer_bases, raw_smp_processor_id()); + tp->tv_nsec = bases[CLOCK_MONOTONIC].resolution; + + return 0; +} + +/** + * ktimer_get_res_real - get the real timer resolution + * + * @which_clock: unused parameter for compability with the posix timer code + * @tp: pointer to timespec variable to store the resolution + * + * Store the resolution of clock realtime in the variable pointed to + * by tp. 
+ */ +int ktimer_get_res_real(clockid_t which_clock, struct timespec *tp) +{ + struct ktimer_base *bases; + + tp->tv_sec = 0; + bases = per_cpu(ktimer_bases, raw_smp_processor_id()); + tp->tv_nsec = bases[CLOCK_REALTIME].resolution; + + return 0; +} + +#ifdef CONFIG_HIGH_RES_TIMERS +/* + * Expire the per base ktimer-queue in high resolution mode: + */ +static inline void run_ktimer_hres_queue(struct ktimer_base *base) +{ + spin_lock_irq(&base->lock); + + while (!list_empty(&base->expired)) { + struct ktimer *timer; + void (*fn)(void *); + void *data; + + timer = list_entry(base->expired.next, struct ktimer, list); + fn = timer->function; + data = timer->data; + __remove_ktimer(timer, base, KTIMER_REARM); + set_curr_timer(base, timer); + spin_unlock_irq(&base->lock); + + fn(data); + + spin_lock_irq(&base->lock); + set_curr_timer(base, NULL); + } + spin_unlock_irq(&base->lock); + + wake_up_timer_waiters(base); +} + +static void run_ktimer_softirq(struct softirq_action *h) +{ + struct ktimer_base *base = per_cpu(ktimer_bases, smp_processor_id()); + int i; + + for (i = 0; i < MAX_KTIMER_BASES; i++) + run_ktimer_hres_queue(&base[i]); +} + +#endif /* CONFIG_HIGH_RES_TIMERS */ + +/* + * Expire the per base ktimer-queue. Used for non HRT mode and + * as a fallback when HRT init failed: + */ +static inline void run_ktimer_queue(struct ktimer_base *base) +{ + ktime_t now = base->get_time(); + + spin_lock_irq(&base->lock); + + while (!list_empty(&base->pending)) { + struct ktimer *timer; + void (*fn)(void *); + void *data; + + timer = list_entry(base->pending.next, struct ktimer, list); + if (ktime_cmp(now, <=, timer->expires)) + break; + + timer->expired = now; + fn = timer->function; + data = timer->data; + set_curr_timer(base, timer); + __remove_ktimer(timer, base, KTIMER_REARM); + spin_unlock_irq(&base->lock); + + fn(data); + + spin_lock_irq(&base->lock); + set_curr_timer(base, NULL); + } + spin_unlock_irq(&base->lock); + + wake_up_timer_waiters(base); +} + +/* + * Called from timer softirq every jiffy, to expire ktimers. + * + * For HRT its the fall back code to run the softirq in the timer + * softirq context in case the hrtimer initialization failed or has + * not been done yet. + */ +void ktimer_run_queues(void) +{ + struct ktimer_base *base = __get_cpu_var(ktimer_bases); + int i; + + ktimer_check_clocks(); + + if (ktimer_hres_active) + return; + + for (i = 0; i < MAX_KTIMER_BASES; i++) + run_ktimer_queue(&base[i]); +} + +/* + * Sleep related functions: + */ + +/* + * Process-wakeup callback: + */ +static void ktimer_wake_up(void *data) +{ + wake_up_process(data); +} + +/** + * schedule_ktimer - sleep until timeout + * + * @timer: ktimer variable initialized with the correct clock base + * @t: timeout value + * @mode: timeout value is abs/rel + * + * Make the current task sleep until @timeout is + * elapsed. + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout is guaranteed to + * pass before the routine returns. The routine will return 0 + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. In this case the remaining time + * will be returned + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. 
+ */ +static ktime_t __sched +schedule_ktimer(struct ktimer *timer, ktime_t *t, int mode) +{ + timer->data = current; + timer->function = ktimer_wake_up; + + if (unlikely(ktimer_start(timer, t, mode) < 0)) { + __set_current_state(TASK_RUNNING); + } else { + if (current->state != TASK_RUNNING) + schedule(); + ktimer_cancel(timer); + } + + /* Store the absolute expiry time */ + *t = timer->expires; + + /* Return the remaining time */ + return ktime_sub(timer->expires, timer->expired); +} + +static ktime_t __sched +schedule_ktimer_interruptible(struct ktimer *timer, ktime_t *t, int mode) +{ + set_current_state(TASK_INTERRUPTIBLE); + + return schedule_ktimer(timer, t, mode); +} + +static long __sched +nanosleep_restart(struct ktimer *timer, struct restart_block *restart) +{ + void *rfn_save = restart->fn; + struct timespec __user *rmtp; + struct timespec tu; + ktime_t t, rem; + + restart->fn = do_no_restart_syscall; + + t = ktime_set_low_high(restart->arg0, restart->arg1); + + rem = schedule_ktimer_interruptible(timer, &t, KTIMER_ABS); + + if (ktime_cmp_val(rem, <=, KTIME_ZERO)) + return 0; + + rmtp = (struct timespec __user *) restart->arg2; + ktime_to_timespec(&tu, rem); + if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) + return -EFAULT; + + restart->fn = rfn_save; + + /* The other values in restart are already filled in */ + return -ERESTART_RESTARTBLOCK; +} + +static long __sched nanosleep_restart_mono(struct restart_block *restart) +{ + struct ktimer timer; + + ktimer_init(&timer); + + return nanosleep_restart(&timer, restart); +} + +static long __sched nanosleep_restart_real(struct restart_block *restart) +{ + struct ktimer timer; + + ktimer_init_real(&timer); + + return nanosleep_restart(&timer, restart); +} + +static long __ktimer_nanosleep(struct ktimer *timer, struct timespec *rqtp, + struct timespec __user *rmtp, int mode, + long (*rfn)(struct restart_block *)) +{ + struct timespec tu; + ktime_t rem, t; + struct restart_block *restart; + + t = timespec_to_ktime(*rqtp); + + /* t is updated to absolute expiry time ! 
*/ + rem = schedule_ktimer_interruptible(timer, &t, mode | KTIMER_ROUND); + + if (ktime_cmp_val(rem, <=, KTIME_ZERO)) + return 0; + + ktime_to_timespec(&tu, rem); + + if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) + return -EFAULT; + + restart = &current_thread_info()->restart_block; + restart->fn = rfn; + restart->arg0 = ktime_get_low(t); + restart->arg1 = ktime_get_high(t); + restart->arg2 = (unsigned long) rmtp; + + return -ERESTART_RESTARTBLOCK; +} + +long ktimer_nanosleep(struct timespec *rqtp, + struct timespec __user *rmtp, int mode) +{ + struct ktimer timer; + + ktimer_init(&timer); + + return __ktimer_nanosleep(&timer, rqtp, rmtp, mode, + nanosleep_restart_mono); +} + +long ktimer_nanosleep_real(struct timespec *rqtp, + struct timespec __user *rmtp, int mode) +{ + struct ktimer timer; + + ktimer_init_real(&timer); + return __ktimer_nanosleep(&timer, rqtp, rmtp, mode, + nanosleep_restart_real); +} + +asmlinkage long +sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) +{ + struct timespec tu; + + if (copy_from_user(&tu, rqtp, sizeof(tu))) + return -EFAULT; + + if (!timespec_valid(&tu)) + return -EINVAL; + + return ktimer_nanosleep(&tu, rmtp, KTIMER_REL); +} + +/* + * Functions related to boot-time initialization: + */ +static void __devinit init_ktimers_cpu(int cpu) +{ + struct ktimer_base *base = per_cpu(ktimer_bases, cpu); + int i; + + for (i = 0; i < MAX_KTIMER_BASES; i++) { + spin_lock_init(&base->lock); + INIT_LIST_HEAD(&base->pending); +#ifdef CONFIG_HIGH_RES_TIMERS + INIT_LIST_HEAD(&base->expired); +#endif + init_waitqueue_head(&base->wait); + base++; + } +#ifdef CONFIG_HIGH_RES_TIMERS + { + ktime_t max; + + ktime_set_scalar(max, KTIME_MAX); + per_cpu(ktimer_hres, cpu).expires_next = max; + set_bit(0, &per_cpu(ktimer_hres, cpu).check_clocks); + per_cpu(ktimer_hres, cpu).active = 0; + } +#endif +} + +#ifdef CONFIG_HOTPLUG_CPU + +static void migrate_ktimer_list(struct ktimer_base *old_base, + struct ktimer_base *new_base) +{ + struct ktimer *timer; + struct rb_node *node; + + while ((node = rb_first(&old_base->active))) { + timer = rb_entry(node, struct ktimer, node); + remove_ktimer(timer, old_base); + timer->base = new_base; + enqueue_ktimer(timer, new_base, NULL, KTIMER_RESTART); + } +} + +static void migrate_ktimers(int cpu) +{ + struct ktimer_base *old_base, *new_base; + int i; + + BUG_ON(cpu_online(cpu)); + old_base = per_cpu(ktimer_bases, cpu); + new_base = get_cpu_var(ktimer_bases); + + local_irq_disable(); + + for (i = 0; i < MAX_KTIMER_BASES; i++) { + + spin_lock(&new_base->lock); + spin_lock(&old_base->lock); + + BUG_ON(old_base->curr_timer); + + migrate_ktimer_list(old_base, new_base); + + spin_unlock(&old_base->lock); + spin_unlock(&new_base->lock); + old_base++; + new_base++; + } + + local_irq_enable(); + put_cpu_var(ktimer_bases); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __devinit ktimer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch(action) { + + case CPU_UP_PREPARE: + init_ktimers_cpu(cpu); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + migrate_ktimers(cpu); + break; +#endif + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata ktimers_nb = { + .notifier_call = ktimer_cpu_notify, +}; + +void __init ktimers_init(void) +{ + ktimer_cpu_notify(&ktimers_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&ktimers_nb); +#ifdef CONFIG_HIGH_RES_TIMERS + 
open_softirq(KTIMER_SOFTIRQ, run_ktimer_softirq, NULL); +#endif +} + Index: linux-2.6.14/kernel/posix-cpu-timers.c =================================================================== --- linux-2.6.14.orig/kernel/posix-cpu-timers.c +++ linux-2.6.14/kernel/posix-cpu-timers.c @@ -1407,7 +1407,7 @@ void set_process_cpu_timer(struct task_s static long posix_cpu_clock_nanosleep_restart(struct restart_block *); int posix_cpu_nsleep(clockid_t which_clock, int flags, - struct timespec *rqtp) + struct timespec *rqtp, struct timespec __user *rmtp) { struct restart_block *restart_block = &current_thread_info()->restart_block; @@ -1432,7 +1432,6 @@ int posix_cpu_nsleep(clockid_t which_clo error = posix_cpu_timer_create(&timer); timer.it_process = current; if (!error) { - struct timespec __user *rmtp; static struct itimerspec zero_it; struct itimerspec it = { .it_value = *rqtp, .it_interval = {} }; @@ -1479,7 +1478,6 @@ int posix_cpu_nsleep(clockid_t which_clo /* * Report back to the user the time still remaining. */ - rmtp = (struct timespec __user *) restart_block->arg1; if (rmtp != NULL && !(flags & TIMER_ABSTIME) && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; @@ -1487,6 +1485,7 @@ int posix_cpu_nsleep(clockid_t which_clo restart_block->fn = posix_cpu_clock_nanosleep_restart; /* Caller already set restart_block->arg1 */ restart_block->arg0 = which_clock; + restart_block->arg1 = (unsigned long) rmtp; restart_block->arg2 = rqtp->tv_sec; restart_block->arg3 = rqtp->tv_nsec; @@ -1500,10 +1499,15 @@ static long posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->arg0; - struct timespec t = { .tv_sec = restart_block->arg2, - .tv_nsec = restart_block->arg3 }; + struct timespec __user *rmtp; + struct timespec t; + + rmtp = (struct timespec __user *) restart_block->arg1; + t.tv_sec = restart_block->arg2; + t.tv_nsec = restart_block->arg3; + restart_block->fn = do_no_restart_syscall; - return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t); + return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp); } @@ -1524,9 +1528,10 @@ static int process_cpu_timer_create(stru return posix_cpu_timer_create(timer); } static int process_cpu_nsleep(clockid_t which_clock, int flags, - struct timespec *rqtp) + struct timespec *rqtp, + struct timespec __user *rmtp) { - return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); + return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); } static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) { @@ -1542,7 +1547,7 @@ static int thread_cpu_timer_create(struc return posix_cpu_timer_create(timer); } static int thread_cpu_nsleep(clockid_t which_clock, int flags, - struct timespec *rqtp) + struct timespec *rqtp, struct timespec __user *rmtp) { return -EINVAL; } Index: linux-2.6.14/kernel/time.c =================================================================== --- linux-2.6.14.orig/kernel/time.c +++ linux-2.6.14/kernel/time.c @@ -38,6 +38,7 @@ #include #include +#include /* * The timezone where the local system is located. Used as a default by some @@ -128,6 +129,7 @@ asmlinkage long sys_gettimeofday(struct * as real UNIX machines always do it. This avoids all headaches about * daylight saving times and warping kernel clocks.
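A worked illustration of the CONFIG_GENERIC_TIME variant added just below, with a made-up timezone value: for sys_tz.tz_minuteswest = 300, the new warp_clock() reads the current time with getnstimeofday(), adds 300 * 60 = 18000 seconds to tv_sec, and feeds the result back through do_settimeofday(), so the clock is warped forward by five hours without manipulating xtime under the seqlock directly.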
*/ +#ifndef CONFIG_GENERIC_TIME static inline void warp_clock(void) { write_seqlock_irq(&xtime_lock); @@ -137,6 +139,18 @@ static inline void warp_clock(void) write_sequnlock_irq(&xtime_lock); clock_was_set(); } +#else /* !CONFIG_GENERIC_TIME */ +/* XXX - this is somewhat cracked out and should + be checked -johnstul@us.ibm.com +*/ +static inline void warp_clock(void) +{ + struct timespec ts; + getnstimeofday(&ts); + ts.tv_sec += sys_tz.tz_minuteswest * 60; + do_settimeofday(&ts); +} +#endif /* !CONFIG_GENERIC_TIME */ /* * In case for some reason the CMOS clock has not already been running @@ -154,6 +168,9 @@ int do_sys_settimeofday(struct timespec static int firsttime = 1; int error = 0; + if (!timespec_valid(tv)) + return -EINVAL; + error = security_settime(tv, tz); if (error) return error; @@ -231,7 +248,9 @@ int do_adjtimex(struct timex *txc) { long ltemp, mtemp, save_adjust; int result; - + unsigned long flags; + struct timespec now_ts; + unsigned long seq; /* In order to modify anything, you gotta be super-user! */ if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; @@ -254,7 +273,13 @@ int do_adjtimex(struct timex *txc) txc->tick > 1100000/USER_HZ) return -EINVAL; - write_seqlock_irq(&xtime_lock); + do { /* save off current xtime */ + seq = read_seqbegin(&xtime_lock); + now_ts = xtime; + } while (read_seqretry(&xtime_lock, seq)); + + write_seqlock_irqsave(&ntp_lock, flags); + result = time_state; /* mostly `TIME_OK' */ /* Save for later - semantics of adjtime is to return old value */ @@ -331,37 +356,27 @@ int do_adjtimex(struct timex *txc) */ if (time_status & STA_FREQHOLD || time_reftime == 0) - time_reftime = xtime.tv_sec; - mtemp = xtime.tv_sec - time_reftime; - time_reftime = xtime.tv_sec; + time_reftime = now_ts.tv_sec; + mtemp = now_ts.tv_sec - time_reftime; + time_reftime = now_ts.tv_sec; if (time_status & STA_FLL) { if (mtemp >= MINSEC) { ltemp = (time_offset / mtemp) << (SHIFT_USEC - SHIFT_UPDATE); - if (ltemp < 0) - time_freq -= -ltemp >> SHIFT_KH; - else - time_freq += ltemp >> SHIFT_KH; + time_freq += shift_right(ltemp, SHIFT_KH); } else /* calibration interval too short (p. 12) */ result = TIME_ERROR; } else { /* PLL mode */ if (mtemp < MAXSEC) { ltemp *= mtemp; - if (ltemp < 0) - time_freq -= -ltemp >> (time_constant + - time_constant + - SHIFT_KF - SHIFT_USEC); - else - time_freq += ltemp >> (time_constant + + time_freq += shift_right(ltemp,(time_constant + time_constant + - SHIFT_KF - SHIFT_USEC); + SHIFT_KF - SHIFT_USEC)); } else /* calibration interval too long (p. 
12) */ result = TIME_ERROR; } - if (time_freq > time_tolerance) - time_freq = time_tolerance; - else if (time_freq < -time_tolerance) - time_freq = -time_tolerance; + time_freq = min(time_freq, time_tolerance); + time_freq = max(time_freq, -time_tolerance); } /* STA_PLL || STA_PPSTIME */ } /* txc->modes & ADJ_OFFSET */ if (txc->modes & ADJ_TICK) { @@ -384,10 +399,7 @@ leave: if ((time_status & (STA_UNSYNC|ST if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) txc->offset = save_adjust; else { - if (time_offset < 0) - txc->offset = -(-time_offset >> SHIFT_UPDATE); - else - txc->offset = time_offset >> SHIFT_UPDATE; + txc->offset = shift_right(time_offset, SHIFT_UPDATE); } txc->freq = time_freq + pps_freq; txc->maxerror = time_maxerror; @@ -405,7 +417,7 @@ leave: if ((time_status & (STA_UNSYNC|ST txc->calcnt = pps_calcnt; txc->errcnt = pps_errcnt; txc->stbcnt = pps_stbcnt; - write_sequnlock_irq(&xtime_lock); + write_sequnlock_irqrestore(&ntp_lock, flags); do_gettimeofday(&txc->time); notify_arch_cmos_timer(); return(result); @@ -486,6 +498,7 @@ struct timespec timespec_trunc(struct ti } EXPORT_SYMBOL(timespec_trunc); +#ifndef CONFIG_GENERIC_TIME #ifdef CONFIG_TIME_INTERPOLATION void getnstimeofday (struct timespec *tv) { @@ -522,10 +535,7 @@ int do_settimeofday (struct timespec *tv set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); time_interpolator_reset(); } write_sequnlock_irq(&xtime_lock); @@ -573,6 +583,131 @@ void getnstimeofday(struct timespec *tv) EXPORT_SYMBOL_GPL(getnstimeofday); #endif +/** + * get_monotonic_clock_ns - Returns monotonically increasing nanoseconds + * + * Returns the monotonically increasing number of nanoseconds + * since the system booted. + */ +nsec_t get_monotonic_clock_ns(void) +{ + struct timespec ts, mo; + unsigned int seq; + + do { + seq = read_seqbegin(&xtime_lock); + getnstimeofday(&ts); + mo = wall_to_monotonic; + } while(read_seqretry(&xtime_lock, seq)); + + ts.tv_sec += mo.tv_sec; + ts.tv_nsec += mo.tv_nsec; + + return ((u64)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec; +} +EXPORT_SYMBOL_GPL(get_monotonic_clock_ns); + + + +#endif /* !CONFIG_GENERIC_TIME */ + +/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. + * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 + * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. + * + * [For the Julian calendar (which was used in Russia before 1917, + * Britain & colonies before 1752, anywhere else before 1582, + * and is still in use by some communities) leave out the + * -year/100+year/400 terms, and add 10.] + * + * This algorithm was first published by Gauss (I think). + * + * WARNING: this function will overflow on 2106-02-07 06:28:16 on + * machines were long is 32-bit! 
(However, as time_t is signed, we + * will already get problems at other places on 2038-01-19 03:14:08) + */ +unsigned long +mktime (unsigned int year, unsigned int mon, + unsigned int day, unsigned int hour, + unsigned int min, unsigned int sec) +{ + if (0 >= (int) (mon -= 2)) { /* 1..12 -> 11,12,1..10 */ + mon += 12; /* Puts Feb last since it has leap day */ + year -= 1; + } + + return ((((unsigned long) + (year/4 - year/100 + year/400 + 367*mon/12 + day) + + year*365 - 719499 + )*24 + hour /* now have hours */ + )*60 + min /* now have minutes */ + )*60 + sec; /* finally seconds */ +} + +/** + * set_normalized_timespec - set timespec sec and nsec parts and normalize + * + * @ts: pointer to timespec variable to be set + * @sec: seconds to set + * @nsec: nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of + * 0 <= tv_nsec < NSEC_PER_SEC + * For negative values only the tv_sec field is negative ! + */ +void set_normalized_timespec (struct timespec *ts, time_t sec, long nsec) +{ + while (nsec > NSEC_PER_SEC) { + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} + +/** + * ns_to_timespec - Convert nanoseconds to timespec + * + * @ts: pointer to timespec variable to store result + * @nsec: nanoseconds value to be converted + * + * Stores the timespec representation of the nanoseconds value in + * the timespec variable pointed to by @ts + */ +void ns_to_timespec(struct timespec *ts, nsec_t nsec) +{ + if (nsec) + ts->tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, + &ts->tv_nsec); + else + ts->tv_sec = ts->tv_nsec = 0; +} + +/** + * ns_to_timeval - Convert nanoseconds to timeval + * + * @tv: pointer to timeval variable to store result + * @nsec: nanoseconds value to be converted + * + * Stores the timeval representation of the nanoseconds value in + * the timeval variable pointed to by @tv + */ +void ns_to_timeval(struct timeval *tv, nsec_t nsec) +{ + struct timespec ts; + + ns_to_timespec(&ts, nsec); + tv->tv_sec = ts.tv_sec; + tv->tv_usec = (suseconds_t) ts.tv_nsec / 1000; +} + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { Index: linux-2.6.14/drivers/scsi/ncr53c8xx.c =================================================================== --- linux-2.6.14.orig/drivers/scsi/ncr53c8xx.c +++ linux-2.6.14/drivers/scsi/ncr53c8xx.c @@ -3481,8 +3481,8 @@ static int ncr_queue_command (struct ncb **---------------------------------------------------- */ if (np->settle_time && cmd->timeout_per_command >= HZ) { - u_long tlimit = ktime_get(cmd->timeout_per_command - HZ); - if (ktime_dif(np->settle_time, tlimit) > 0) + u_long tlimit = Ktime_get(cmd->timeout_per_command - HZ); + if (Ktime_dif(np->settle_time, tlimit) > 0) np->settle_time = tlimit; } @@ -3516,7 +3516,7 @@ static int ncr_queue_command (struct ncb ** Force ordered tag if necessary to avoid timeouts ** and to preserve interactivity. 
*/ - if (lp && ktime_exp(lp->tags_stime)) { + if (lp && Ktime_exp(lp->tags_stime)) { if (lp->tags_smap) { order = M_ORDERED_TAG; if ((DEBUG_FLAGS & DEBUG_TAGS)||bootverbose>2){ @@ -3524,7 +3524,7 @@ static int ncr_queue_command (struct ncb "ordered tag forced.\n"); } } - lp->tags_stime = ktime_get(3*HZ); + lp->tags_stime = Ktime_get(3*HZ); lp->tags_smap = lp->tags_umap; } @@ -3792,7 +3792,7 @@ static int ncr_reset_scsi_bus(struct ncb u32 term; int retv = 0; - np->settle_time = ktime_get(settle_delay * HZ); + np->settle_time = Ktime_get(settle_delay * HZ); if (bootverbose > 1) printk("%s: resetting, " @@ -5044,7 +5044,7 @@ static void ncr_setup_tags (struct ncb * static void ncr_timeout (struct ncb *np) { - u_long thistime = ktime_get(0); + u_long thistime = Ktime_get(0); /* ** If release process in progress, let's go @@ -5057,7 +5057,7 @@ static void ncr_timeout (struct ncb *np) return; } - np->timer.expires = ktime_get(SCSI_NCR_TIMER_INTERVAL); + np->timer.expires = Ktime_get(SCSI_NCR_TIMER_INTERVAL); add_timer(&np->timer); /* @@ -5336,8 +5336,8 @@ void ncr_exception (struct ncb *np) **========================================================= */ - if (ktime_exp(np->regtime)) { - np->regtime = ktime_get(10*HZ); + if (Ktime_exp(np->regtime)) { + np->regtime = Ktime_get(10*HZ); for (i = 0; i<sizeof(np->regdump); i++) ((char*)&np->regdump)[i] = INB_OFF(i); np->regdump.nc_dstat = dstat; @@ -5453,7 +5453,7 @@ static int ncr_int_sbmc (struct ncb *np) ** Suspend command processing for 1 second and ** reinitialize all except the chip. */ - np->settle_time = ktime_get(1*HZ); + np->settle_time = Ktime_get(1*HZ); ncr_init (np, 0, bootverbose ? "scsi mode change" : NULL, HS_RESET); return 1; } @@ -6923,7 +6923,7 @@ static struct lcb *ncr_setup_lcb (struct for (i = 0 ; i < MAX_TAGS ; i++) lp->cb_tags[i] = i; lp->maxnxs = MAX_TAGS; - lp->tags_stime = ktime_get(3*HZ); + lp->tags_stime = Ktime_get(3*HZ); ncr_setup_tags (np, sdev); } Index: linux-2.6.14/drivers/scsi/sym53c8xx_defs.h =================================================================== --- linux-2.6.14.orig/drivers/scsi/sym53c8xx_defs.h +++ linux-2.6.14/drivers/scsi/sym53c8xx_defs.h @@ -285,12 +285,12 @@ ** kernel time values (jiffies) to some that have ** chance not to be too much incorrect.
:-) */ -#define ktime_get(o) (jiffies + (u_long) o) -#define ktime_exp(b) ((long)(jiffies) - (long)(b) >= 0) -#define ktime_dif(a, b) ((long)(a) - (long)(b)) +#define Ktime_get(o) (jiffies + (u_long) o) +#define Ktime_exp(b) ((long)(jiffies) - (long)(b) >= 0) +#define Ktime_dif(a, b) ((long)(a) - (long)(b)) /* These ones are not used in this driver */ -#define ktime_add(a, o) ((a) + (u_long)(o)) -#define ktime_sub(a, o) ((a) - (u_long)(o)) +#define Ktime_add(a, o) ((a) + (u_long)(o)) +#define Ktime_sub(a, o) ((a) - (u_long)(o)) /* Index: linux-2.6.14/include/linux/timex.h =================================================================== --- linux-2.6.14.orig/include/linux/timex.h +++ linux-2.6.14/include/linux/timex.h @@ -260,6 +260,7 @@ extern long pps_calcnt; /* calibration extern long pps_errcnt; /* calibration errors */ extern long pps_stbcnt; /* stability limit exceeded */ +extern seqlock_t ntp_lock; /** * ntp_clear - Clears the NTP state variables * @@ -267,10 +268,14 @@ extern long pps_stbcnt; /* stability li */ static inline void ntp_clear(void) { + unsigned long flags; + write_seqlock_irqsave(&ntp_lock, flags); time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; + write_sequnlock_irqrestore(&ntp_lock, flags); + } /** @@ -282,6 +287,33 @@ static inline int ntp_synced(void) return !(time_status & STA_UNSYNC); } +/** + * ntp_get_ppm_adjustment - Returns Shifted PPM adjustment + * + */ +long ntp_get_ppm_adjustment(void); + +/** + * ntp_advance - Advances the NTP state machine by interval_ns + * + */ +void ntp_advance(unsigned long interval_ns); + +/** + * ntp_leapsecond - NTP leapsecond processing code. + * + */ +int ntp_leapsecond(struct timespec now); + + +/* Required to safely shift negative values */ +#define shift_right(x, s) ({ \ + __typeof__(x) __x = (x); \ + __typeof__(s) __s = (s); \ + __x < 0 ? -(-__x >> __s) : __x >> __s; \ +}) + +#ifndef CONFIG_GENERIC_TIME #ifdef CONFIG_TIME_INTERPOLATION @@ -337,6 +369,7 @@ time_interpolator_reset(void) } #endif /* !CONFIG_TIME_INTERPOLATION */ +#endif /* !CONFIG_GENERIC_TIME */ #endif /* KERNEL */ Index: linux-2.6.14/Documentation/kernel-parameters.txt =================================================================== --- linux-2.6.14.orig/Documentation/kernel-parameters.txt +++ linux-2.6.14/Documentation/kernel-parameters.txt @@ -52,6 +52,7 @@ restrictions referred to are that the re MTD MTD support is enabled. NET Appropriate network support is enabled. NUMA NUMA support is enabled. + GENERIC_TIME The generic timeofday code is enabled. NFS Appropriate NFS support is enabled. OSS OSS sound support is enabled. PARIDE The ParIDE subsystem is enabled. @@ -330,6 +331,7 @@ running once the system is up. /selinux/checkreqprot. clock= [BUGS=IA-32,HW] gettimeofday timesource override. + [Deprecated] Forces specified timesource (if avaliable) to be used when calculating gettimeofday(). If specicified timesource is not avalible, it defaults to PIT. @@ -1479,6 +1481,10 @@ running once the system is up. time Show timing data prefixed to each printk message line + clocksource= [GENERIC_TIME] Override the default clocksource + Override the default clocksource and use the clocksource + with the name specified. + tipar.timeout= [HW,PPT] Set communications timeout in tenths of a second (default 15). 
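The shift_right() helper added to timex.h above exists because right-shifting a negative signed integer is implementation-defined in C; the kernel/time.c hunks earlier in the patch use it to replace the old if/else branches on the sign of the value. A minimal userspace sketch of the difference against a plain shift (it assumes GCC's typeof and statement-expression extensions; it is an editorial illustration, not kernel code):

#include <stdio.h>

/* Same helper as the timex.h hunk: shift toward zero even for negatives. */
#define shift_right(x, s) ({			\
	__typeof__(x) __x = (x);		\
	__typeof__(s) __s = (s);		\
	__x < 0 ? -(-__x >> __s) : __x >> __s;	\
})

int main(void)
{
	long v = -1000;

	/* On common two's-complement compilers a plain shift rounds toward
	 * negative infinity (-1000 >> 4 == -63), while the NTP code wants
	 * truncation toward zero (shift_right(-1000, 4) == -62). */
	printf("plain shift : %ld\n", v >> 4);
	printf("shift_right : %ld\n", shift_right(v, 4));
	return 0;
}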
Index: linux-2.6.14/include/linux/clocksource.h =================================================================== --- /dev/null +++ linux-2.6.14/include/linux/clocksource.h @@ -0,0 +1,326 @@ +/* linux/include/linux/clocksource.h + * + * This file contains the structure definitions for clocksources. + * + * If you are not a clocksource, or the time of day code, you should + * not be including this file! + */ +#ifndef _LINUX_CLOCKSOURCE_H +#define _LINUX_CLOCKSOURCE_H + +#include +#include +#include +#include +#include +#include + +/** + * struct clocksource - hardware abstraction for a free running counter + * Provides mostly state-free accessors to the underlying hardware. + * + * @name: ptr to clocksource name + * @list: list head for registration + * @rating: rating value for selection (higher is better) + * To avoid rating inflation the following + * list should give you a guide as to how + * to assign your clocksource a rating + * 1-99: Unfit for real use + * Only available for bootup and testing purposes. + * 100-199: Base level usability. + * Functional for real use, but not desired. + * 200-299: Good. + * A correct and usable clocksource. + * 300-399: Desired. + * A reasonably fast and accurate clocksource. + * 400-499: Perfect + * The ideal clocksource. A must-use where + * available. + * @type: defines clocksource type + * @read_fnct: returns a cycle value + * @ptr: ptr to MMIO'ed counter + * @mask: bitmask for two's complement + * subtraction of non 64 bit counters + * @mult: cycle to nanosecond multiplier + * @shift: cycle to nanosecond divisor (power of two) + * @update_callback: called when safe to alter clocksource values + * @is_continuous: defines if clocksource is free-running. + */ +struct clocksource { + char* name; + struct list_head list; + int rating; + enum { + CLOCKSOURCE_FUNCTION, + CLOCKSOURCE_CYCLES, + CLOCKSOURCE_MMIO_32, + CLOCKSOURCE_MMIO_64 + } type; + cycle_t (*read_fnct)(void); + void __iomem *mmio_ptr; + cycle_t mask; + u32 mult; + u32 shift; + int (*update_callback)(void); + int is_continuous; +}; + + +/** + * clocksource_khz2mult - calculates mult from khz and shift + * @khz: Clocksource frequency in KHz + * @shift_constant: Clocksource shift factor + * + * Helper functions that converts a khz counter frequency to a timsource + * multiplier, given the clocksource shift value + */ +static inline u32 clocksource_khz2mult(u32 khz, u32 shift_constant) +{ + /* khz = cyc/(Million ns) + * mult/2^shift = ns/cyc + * mult = ns/cyc * 2^shift + * mult = 1Million/khz * 2^shift + * mult = 1000000 * 2^shift / khz + * mult = (1000000<type) { + case CLOCKSOURCE_MMIO_32: + return (cycle_t)readl(cs->mmio_ptr); + case CLOCKSOURCE_MMIO_64: + return (cycle_t)clocksource_readq(cs->mmio_ptr); + case CLOCKSOURCE_CYCLES: + return (cycle_t)get_cycles(); + default:/* case: CLOCKSOURCE_FUNCTION */ + return cs->read_fnct(); + } +} + +/** + * ppm_to_mult_adj - Converts shifted ppm values to mult adjustment + * @cs: Pointer to clocksource + * @ppm: Shifted PPM value + * + * Helper which converts a shifted ppm value to clocksource mult_adj value. + * + * XXX - this could use some optimization + */ +static inline int ppm_to_mult_adj(struct clocksource *cs, int ppm) +{ + u64 mult_adj; + int ret_adj; + + /* The basic math is as follows: + * cyc * mult/2^shift * (1 + ppm/MILL) = scaled ns + * We want to precalculate the ppm factor so it can be added + * to the multiplyer saving the extra multiplication step. 
+ * cyc * (mult/2^shift + (mult/2^shift) * (ppm/MILL)) = + * cyc * (mult/2^shift + (mult*ppm/MILL)/2^shift) = + * cyc * (mult + (mult*ppm/MILL))/2^shift = + * Thus we want to calculate the value of: + * mult*ppm/MILL + */ + mult_adj = abs(ppm); + mult_adj = (mult_adj * cs->mult)>>SHIFT_USEC; + mult_adj += 1000000/2; /* round for div*/ + do_div(mult_adj, 1000000); + if (ppm < 0) + ret_adj = -(int)mult_adj; + else + ret_adj = (int)mult_adj; + return ret_adj; +} + +/** + * cyc2ns - converts clocksource cycles to nanoseconds + * @cs: Pointer to clocksource + * @ntp_adj: Multiplier adjustment value + * @cycles: Cycles + * + * Uses the clocksource and ntp ajdustment to convert cycle_ts to nanoseconds. + * + * XXX - This could use some mult_lxl_ll() asm optimization + */ +static inline nsec_t cyc2ns(struct clocksource *cs, int ntp_adj, cycle_t cycles) +{ + u64 ret; + ret = (u64)cycles; + ret *= (cs->mult + ntp_adj); + ret >>= cs->shift; + return (nsec_t)ret; +} + +/** + * cyc2ns_rem - converts clocksource cycles to nanoseconds w/ remainder + * @cs: Pointer to clocksource + * @ntp_adj: Multiplier adjustment value + * @cycles: Cycles + * @rem: Remainder + * + * Uses the clocksource and ntp ajdustment interval to convert cycle_t to + * nanoseconds. Add in remainder portion which is stored in (ns<shift) + * units and save the new remainder off. + * + * XXX - This could use some mult_lxl_ll() asm optimization. + */ +static inline nsec_t cyc2ns_rem(struct clocksource *cs, int ntp_adj, cycle_t cycles, u64* rem) +{ + u64 ret; + ret = (u64)cycles; + ret *= (cs->mult + ntp_adj); + if (rem) { + ret += *rem; + *rem = ret & ((1<shift)-1); + } + ret >>= cs->shift; + return (nsec_t)ret; +} + + +/** + * struct clocksource_interval - Fixed interval conversion structure + * + * @cycles: A specified number of cycles + * @nsecs: The number of nanoseconds equivalent to the cycles value + * @remainder: Non-integer nanosecond remainder stored in (ns<shift) units + * @remainder_ns_overflow: Value at which the remainder is equal to + * one second + * + * This is a optimization structure used by cyc2ns_fixed_rem() to avoid the + * multiply in cyc2ns(). + * + * Unless you're the timeofday_periodic_hook, you should not be using this! + */ +struct clocksource_interval { + cycle_t cycles; + nsec_t nsecs; + u64 remainder; + u64 remainder_ns_overflow; +}; + +/** + * calculate_clocksource_interval - Calculates a clocksource interval struct + * + * @c: Pointer to clocksource. + * @adj: Multiplyer adjustment. + * @length_nsec: Desired interval length in nanoseconds. + * + * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment + * pair and interval request. + * + * Unless you're the timeofday_periodic_hook, you should not be using this! 
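All of the cycle-to-nanosecond conversions above boil down to the same fixed-point idea: mult/2^shift approximates nanoseconds per cycle. A small standalone sketch of that arithmetic, where khz2mult() and cyc2ns() are local stand-ins for the patch's clocksource_khz2mult() and cyc2ns(), and the 1 MHz counter and shift of 20 are made-up example values:

#include <stdio.h>
#include <stdint.h>

static uint32_t khz2mult(uint32_t khz, uint32_t shift)
{
	/* mult = (1000000 << shift) / khz, i.e. ns-per-cycle scaled by 2^shift */
	uint64_t tmp = (uint64_t)1000000 << shift;

	tmp += khz / 2;		/* round the division */
	return (uint32_t)(tmp / khz);
}

static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	return (cycles * mult) >> shift;
}

int main(void)
{
	uint32_t shift = 20;
	uint32_t mult = khz2mult(1000, shift);	/* hypothetical 1 MHz counter */

	/* 5 cycles of a 1 MHz counter should convert to about 5000 ns */
	printf("mult=%u, 5 cycles -> %llu ns\n", mult,
	       (unsigned long long)cyc2ns(5, mult, shift));
	return 0;
}

With those numbers mult comes out to 1048576000, and five counter cycles convert to exactly 5000 ns.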
+ */ +static inline struct clocksource_interval +calculate_clocksource_interval(struct clocksource *c, long adj, + unsigned long length_nsec) +{ + struct clocksource_interval ret; + u64 tmp; + + /* XXX - All of this could use a whole lot of optimization */ + tmp = length_nsec; + tmp <<= c->shift; + do_div(tmp, c->mult+adj); + + ret.cycles = (cycle_t)tmp; + if(ret.cycles == 0) + ret.cycles = 1; + + ret.remainder = 0; + ret.remainder_ns_overflow = 1 << c->shift; + ret.nsecs = cyc2ns_rem(c, adj, ret.cycles, &ret.remainder); + + return ret; +} + +/** + * cyc2ns_fixed_rem - + * converts clocksource cycles to nanoseconds using fixed intervals + * + * @interval: precalculated clocksource_interval structure + * @cycles: Number of clocksource cycles + * @rem: Remainder + * + * Uses a precalculated fixed cycle/nsec interval to convert cycles to + * nanoseconds. Returns the unaccumulated cycles in the cycles pointer as + * well as uses and updates the value at the remainder pointer + * + * Unless you're the timeofday_periodic_hook, you should not be using this! + */ +static inline nsec_t cyc2ns_fixed_rem(struct clocksource_interval interval, cycle_t *cycles, u64* rem) +{ + nsec_t delta_nsec = 0; + while(*cycles > interval.cycles) { + delta_nsec += interval.nsecs; + *cycles -= interval.cycles; + *rem += interval.remainder; + while(*rem > interval.remainder_ns_overflow) { + *rem -= interval.remainder_ns_overflow; + delta_nsec += 1; + } + } + return delta_nsec; +} + + +/* used to install a new clocksource */ +void register_clocksource(struct clocksource*); +void reselect_clocksource(void); +struct clocksource* get_next_clocksource(void); +#endif Index: linux-2.6.14/kernel/time/Makefile =================================================================== --- /dev/null +++ linux-2.6.14/kernel/time/Makefile @@ -0,0 +1 @@ +obj-y = clocksource.o jiffies.o clockevents.o timeofday.o Index: linux-2.6.14/kernel/time/clocksource.c =================================================================== --- /dev/null +++ linux-2.6.14/kernel/time/clocksource.c @@ -0,0 +1,286 @@ +/********************************************************************* +* linux/kernel/time/clocksource.c +* +* This file contains the functions which manage clocksource drivers. +* +* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +* +* TODO WishList: +* o Allow clocksource drivers to be unregistered +* o get rid of clocksource_jiffies extern +**********************************************************************/ + +#include +#include +#include +#include + +/* XXX - Would like a better way for initializing curr_clocksource */ +extern struct clocksource clocksource_jiffies; + +/*[Clocksource internal variables]--------- + * curr_clocksource: + * currently selected clocksource. Initialized to clocksource_jiffies. 
+ * next_clocksource: + * pending next selected clocksource. + * clocksource_list: + * linked list with the registered clocksources + * clocksource_lock: + * protects manipulations to curr_clocksource and next_clocksource + * and the clocksource_list + * override_name: + * Name of the user-specified clocksource. + */ +static struct clocksource *curr_clocksource = &clocksource_jiffies; +static struct clocksource *next_clocksource; +static LIST_HEAD(clocksource_list); +static seqlock_t clocksource_lock = SEQLOCK_UNLOCKED; + +static char override_name[32]; + + +/** + * get_next_clocksource - Returns the selected clocksource + * + */ +struct clocksource *get_next_clocksource(void) +{ + write_seqlock(&clocksource_lock); + if (next_clocksource) { + curr_clocksource = next_clocksource; + next_clocksource = NULL; + } + write_sequnlock(&clocksource_lock); + + return curr_clocksource; +} + + +/** + * select_clocksource - Finds the best registered clocksource. + * + * Private function. Must have a writelock on clocksource_lock + * when called. + */ +static struct clocksource *select_clocksource(void) +{ + struct clocksource *best = NULL; + struct list_head *tmp; + + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + if (!best) + best = src; + + /* Check for override */ + if (strlen(src->name) == strlen(override_name) && + !strcmp(src->name, override_name)) { + best = src; + break; + } + /* Pick the highest rating */ + if (src->rating > best->rating) + best = src; + } + return best; +} + + +/** + * is_registered_source - Checks if clocksource is registered + * @c: pointer to a clocksource + * + * Private helper function, should not be used externally. + * + * Returns one if the clocksource is already registered, zero otherwise. + */ +static inline int is_registered_source(struct clocksource *c) +{ + struct list_head *tmp; + int len = strlen(c->name); + + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + if (strlen(src->name) == len && !strcmp(src->name, c->name)) + return 1; + } + return 0; +} + + +/** + * register_clocksource - Used to install new clocksources + * @t: clocksource to be registered + * + */ +void register_clocksource(struct clocksource *c) +{ + write_seqlock(&clocksource_lock); + + /* check if clocksource is already registered */ + if (is_registered_source(c)) { + printk("register_clocksource: Cannot register %s. Already registered!", + c->name); + } else { + list_add(&c->list, &clocksource_list); + /* select next clocksource */ + next_clocksource = select_clocksource(); + } + write_sequnlock(&clocksource_lock); +} +EXPORT_SYMBOL(register_clocksource); + + +/** + * reselect_clocksource - Rescan list for next clocksource + * + * A quick helper function to be used if a clocksource changes its + * rating. Forces the clocksource list to be re-scaned for the best + * clocksource. 
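The selection policy implemented by select_clocksource() above is simple: a user-supplied override name wins outright, otherwise the highest rating does. A standalone sketch of just that rule (struct cs and pick() are illustrative stand-ins, not the patch's types):

#include <stdio.h>
#include <string.h>

struct cs { const char *name; int rating; };

static struct cs *pick(struct cs *list, int n, const char *override)
{
	struct cs *best = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (!best)
			best = &list[i];
		if (override && !strcmp(list[i].name, override))
			return &list[i];	/* user override beats rating */
		if (list[i].rating > best->rating)
			best = &list[i];
	}
	return best;
}

int main(void)
{
	struct cs list[] = { {"jiffies", 0}, {"pit", 110}, {"tsc", 300} };

	printf("default : %s\n", pick(list, 3, NULL)->name);	/* tsc */
	printf("override: %s\n", pick(list, 3, "pit")->name);	/* pit */
	return 0;
}

In the patch the override name is filled in either from the clocksource= boot parameter or by writing the sysfs attribute registered further down in clocksource.c, which should surface somewhere like /sys/devices/system/clocksource/clocksource0/clocksource (the exact path depends on how the sysdev class is laid out).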
+ */ +void reselect_clocksource(void) +{ + write_seqlock(&clocksource_lock); + next_clocksource = select_clocksource(); + write_sequnlock(&clocksource_lock); +} + + +/** + * sysfs_show_clocksources - sysfs interface for listing clocksource + * @dev: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing registered clocksources + */ +static ssize_t sysfs_show_clocksources(struct sys_device *dev, char *buf) +{ + char* curr = buf; + struct list_head *tmp; + + write_seqlock(&clocksource_lock); + + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + /* Mark current clocksource w/ a star */ + if (src == curr_clocksource) + curr += sprintf(curr, "*"); + curr += sprintf(curr, "%s ", src->name); + } + write_sequnlock(&clocksource_lock); + + curr += sprintf(curr, "\n"); + return curr - buf; +} + + +/** + * sysfs_override_clocksource - interface for manually overriding clocksource + * @dev: unused + * @buf: name of override clocksource + * @count: length of buffer + * + * Takes input from sysfs interface for manually overriding the default + * clocksource selction + */ +static ssize_t sysfs_override_clocksource(struct sys_device *dev, + const char *buf, size_t count) +{ + /* Strings from sysfs write are not 0 terminated ! */ + if (count >= sizeof(override_name)) + return -EINVAL; + /* Strip of \n */ + if (buf[count-1] == '\n') + count--; + if (count < 1) + return -EINVAL; + + write_seqlock(&clocksource_lock); + + /* copy the name given */ + memcpy(override_name, buf, count); + override_name[count] = 0; + + /* try to select it */ + next_clocksource = select_clocksource(); + + write_sequnlock(&clocksource_lock); + return count; +} + + +/* Sysfs setup bits: + */ +static SYSDEV_ATTR(clocksource, 0600, sysfs_show_clocksources, sysfs_override_clocksource); + +static struct sysdev_class clocksource_sysclass = { + set_kset_name("clocksource"), +}; + +static struct sys_device device_clocksource = { + .id = 0, + .cls = &clocksource_sysclass, +}; + +static int init_clocksource_sysfs(void) +{ + int error = sysdev_class_register(&clocksource_sysclass); + if (!error) { + error = sysdev_register(&device_clocksource); + if (!error) + error = sysdev_create_file(&device_clocksource, &attr_clocksource); + } + return error; +} +device_initcall(init_clocksource_sysfs); + + +/** + * boot_override_clocksource - boot clock override + * @str: override name + * + * Takes a clocksource= boot argument and uses it + * as the clocksource override name + */ +static int __init boot_override_clocksource(char* str) +{ + if (str) + strlcpy(override_name, str, sizeof(override_name)); + return 1; +} +__setup("clocksource=", boot_override_clocksource); + + +/** + * boot_override_clock - Compatibility layer for deprecated boot option + * @str: override name + * + * DEPRECATED! Takes a clock= boot argument and uses it + * as the clocksource override name + */ +static int __init boot_override_clock(char* str) +{ + printk("Warning! clock= boot option is deprecated.\n"); + return boot_override_clocksource(str); +} +__setup("clock=", boot_override_clock); Index: linux-2.6.14/kernel/time/jiffies.c =================================================================== --- /dev/null +++ linux-2.6.14/kernel/time/jiffies.c @@ -0,0 +1,75 @@ +/*********************************************************************** +* linux/kernel/time/jiffies.c +* +* This file contains the jiffies based clocksource. 
+* +* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +* +************************************************************************/ +#include +#include +#include + +/* The Jiffies based clocksource is the lowest common + * denominator clock source which should function on + * all systems. It has the same coarse resolution as + * the timer interrupt frequency HZ and it suffers + * inaccuracies caused by missed or lost timer + * interrupts and the inability for the timer + * interrupt hardware to accurately tick at the + * requested HZ value. It is also not recommended + * for "tick-less" systems. + */ +#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) + +/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier + * conversion, the .shift value could be zero. However + * this would make NTP adjustments impossible as they are + * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to + * shift both the numerator and denominator the same + * amount, and give ntp adjustments in units of 1/2^8 + * + * The value 8 is somewhat carefully chosen, as anything + * larger can result in overflows. NSEC_PER_JIFFY grows as + * HZ shrinks, so values greater than 8 overflow 32bits when + * HZ=100.
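A quick back-of-the-envelope check of that overflow claim, assuming HZ=100 so one jiffy is roughly 10,000,000 ns (standalone arithmetic, not part of the patch):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t nsec_per_jiffy = 10000000;	/* ~10 ms per tick at HZ=100 */
	int shift;

	for (shift = 8; shift <= 9; shift++) {
		uint64_t mult = nsec_per_jiffy << shift;

		printf("shift %d: mult=%llu %s\n", shift,
		       (unsigned long long)mult,
		       mult > UINT32_MAX ? "(overflows u32)" : "(fits in u32)");
	}
	return 0;
}

mult stays at 2,560,000,000 for a shift of 8 but would need 5,120,000,000 for a shift of 9, which no longer fits in the 32-bit mult field.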
+ */ +#define JIFFIES_SHIFT 8 + +static cycle_t jiffies_read(void) +{ + cycle_t ret = get_jiffies_64(); + return ret; +} + +struct clocksource clocksource_jiffies = { + .name = "jiffies", + .rating = 0, /* lowest rating*/ + .type = CLOCKSOURCE_FUNCTION, + .read_fnct = jiffies_read, + .mask = (cycle_t)-1, + .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* See above for details */ + .shift = JIFFIES_SHIFT, + .is_continuous = 0, /* tick based, not free running */ +}; + +static int __init init_jiffies_clocksource(void) +{ + register_clocksource(&clocksource_jiffies); + return 0; +} +module_init(init_jiffies_clocksource); Index: linux-2.6.14/drivers/char/hangcheck-timer.c =================================================================== --- linux-2.6.14.orig/drivers/char/hangcheck-timer.c +++ linux-2.6.14/drivers/char/hangcheck-timer.c @@ -49,6 +49,7 @@ #include #include #include +#include #define VERSION_STR "0.9.0" @@ -130,8 +131,12 @@ __setup("hcheck_dump_tasks", hangcheck_p #endif #ifdef HAVE_MONOTONIC +#ifndef CONFIG_GENERIC_TIME extern unsigned long long monotonic_clock(void); #else +#define monotonic_clock() get_monotonic_clock_ns() +#endif +#else static inline unsigned long long monotonic_clock(void) { # ifdef __s390__ Index: linux-2.6.14/include/asm-generic/timeofday.h =================================================================== --- /dev/null +++ linux-2.6.14/include/asm-generic/timeofday.h @@ -0,0 +1,29 @@ +/* linux/include/asm-generic/timeofday.h + * + * This file contains the asm-generic interface + * to the arch specific calls used by the time of day subsystem + */ +#ifndef _ASM_GENERIC_TIMEOFDAY_H +#define _ASM_GENERIC_TIMEOFDAY_H +#include +#include +#include +#include +#include + +#include +#ifdef CONFIG_GENERIC_TIME +/* Required externs */ +extern nsec_t read_persistent_clock(void); +extern void sync_persistent_clock(struct timespec ts); + +#ifdef CONFIG_GENERIC_TIME_VSYSCALL +extern void arch_update_vsyscall_gtod(struct timespec wall_time, + cycle_t offset_base, struct clocksource* clock, + int ntp_adj); +#else +#define arch_update_vsyscall_gtod(x,y,z,w) {} +#endif /* CONFIG_GENERIC_TIME_VSYSCALL */ + +#endif /* CONFIG_GENERIC_TIME */ +#endif Index: linux-2.6.14/include/linux/timeofday.h =================================================================== --- /dev/null +++ linux-2.6.14/include/linux/timeofday.h @@ -0,0 +1,47 @@ +/* linux/include/linux/timeofday.h + * + * This file contains the interface to the time of day subsystem + */ +#ifndef _LINUX_TIMEOFDAY_H +#define _LINUX_TIMEOFDAY_H +#include +#include +#include +#include +#include + +#ifdef CONFIG_GENERIC_TIME + +/* Kernel internal interfaces */ +extern ktime_t get_realtime_offset(void); +extern ktime_t get_realtime_clock(void); +extern ktime_t get_monotonic_clock(void); + +/* Timespec based interfaces for user space functionality */ +extern void get_realtime_clock_ts(struct timespec *ts); +extern void get_monotonic_clock_ts(struct timespec *ts); + +/* legacy timeofday interfaces*/ +#define getnstimeofday(ts) get_realtime_clock_ts(ts) +extern void do_gettimeofday(struct timeval *tv); +extern int do_settimeofday(struct timespec *ts); + +/* get_monotonic_clock_ns wrapper */ +#define get_monotonic_clock_ns() ktime_to_ns(get_monotonic_clock()) + +/* Internal functions */ +extern int timeofday_is_continuous(void); +extern void timeofday_init(void); + +#ifndef CONFIG_IS_TICK_BASED +#define arch_getoffset() (0) +#else +extern unsigned long arch_getoffset(void); +#endif + +#else /* CONFIG_GENERIC_TIME */ + +#define 
timeofday_init() + +#endif /* CONFIG_GENERIC_TIME */ +#endif /* _LINUX_TIMEOFDAY_H */ Index: linux-2.6.14/kernel/time/timeofday.c =================================================================== --- /dev/null +++ linux-2.6.14/kernel/time/timeofday.c @@ -0,0 +1,681 @@ +/* + * linux/kernel/time/timeofday.c + * + * This file contains the functions which access and manage + * the system's time of day functionality. + * + * Copyright (C) 2003, 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * TODO WishList: + * o See XXX's below. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Periodic hook interval */ +#define PERIODIC_INTERVAL_MS 50 + +/* [ktime_t based variables] + * system_time: + * Monotonically increasing counter of the number of nanoseconds + * since boot. + * wall_time_offset: + * Offset added to system_time to provide accurate time-of-day + */ +static ktime_t system_time; +static ktime_t wall_time_offset; + +/* [timespec based variables] + * These variable are mirroring the ktime_t based variabled to avoid + * performance problems in the userspace interface paths + * + * wall_time_ts: + * timespec holding the current wall time. + * mono_time_ts: + * timespec holding the current monotonic time. + * monotonic_time_offset_ts: + * timespec holding the difference between wall and monotonic time. + */ +static struct timespec wall_time_ts; +static struct timespec mono_time_ts; +static struct timespec monotonic_time_offset_ts; + +/* [Cycle based variables] + * cycle_last: + * Value of the clocksource at the last timeofday_periodic_hook() + * (adjusted only minorly to account for rounded off cycles) + */ +static cycle_t cycle_last; + +/* [Clocksource_interval variables] + * ts_interval: This clocksource_interval is used in the + * fixed interval cycles to nanosecond calculation. + * + * INTERVAL_LEN: This constant is the requested fixed + * interval period in nanoseconds. + */ +struct clocksource_interval ts_interval; +#define INTERVAL_LEN ((PERIODIC_INTERVAL_MS-1)*1000000) + +/* [Time source data] + * clocks: + * current clocksource pointer + */ +static struct clocksource *clock; + +/* [NTP adjustment] + * ntp_adj: + * value of the current ntp adjustment, + * stored in clocksource multiplier units. 
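The variables documented above split timekeeping into a monotonic count plus a realtime offset, so that setting the clock never has to touch the monotonic side. A toy model using plain 64-bit nanosecond counters in place of ktime_t (the names here are illustrative, not the patch's):

#include <stdio.h>
#include <stdint.h>

static int64_t system_time_ns;		/* monotonic ns since boot */
static int64_t wall_offset_ns;		/* realtime = monotonic + offset */

static int64_t get_monotonic(void) { return system_time_ns; }
static int64_t get_realtime(void)  { return system_time_ns + wall_offset_ns; }

static void settimeofday_ns(int64_t new_wall)
{
	/* same idea as the patch's do_settimeofday(): only move the offset */
	wall_offset_ns = new_wall - get_monotonic();
}

int main(void)
{
	system_time_ns = 5000;			/* pretend 5 us of uptime */
	settimeofday_ns(1000000000LL);		/* set wall clock to 1 s */

	system_time_ns += 2000;			/* time advances 2 us */
	printf("monotonic %lld ns, wall %lld ns\n",
	       (long long)get_monotonic(), (long long)get_realtime());
	return 0;
}

After setting the wall clock and letting 2 us elapse, the monotonic value reads 7000 ns while the wall value reads 1,000,002,000 ns; only wall_offset_ns ever moved.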
+ */ +int ntp_adj; + +/* [Locks] + * system_time_lock: + * generic lock for all locally scoped time values + */ +static seqlock_t system_time_lock = SEQLOCK_UNLOCKED; + +/* [Suspend/Resume info] + * time_suspend_state: + * variable that keeps track of suspend state + * suspend_start: + * start time of the suspend call + */ +static enum { + TIME_RUNNING, + TIME_SUSPENDED +} time_suspend_state = TIME_RUNNING; + +static nsec_t suspend_start; + +/* [Soft-Timers] + * timeofday_timer: + * soft-timer used to call timeofday_periodic_hook() + */ +struct timer_list timeofday_timer; + + +/** + * update_legacy_time_values - sync legacy time values + * + * The function is necessary for a smooth transition to + * the new time keeping. When all the xtime/wall_to_monotonic + * users are converted this function can be removed. + * + * system_time_lock must be held by the caller + */ +static void update_legacy_time_values(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + + xtime = wall_time_ts; + set_normalized_timespec(&wall_to_monotonic, + -monotonic_time_offset_ts.tv_sec, + -monotonic_time_offset_ts.tv_nsec); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* since time state has changed, notify vsyscall code */ + arch_update_vsyscall_gtod(wall_time_ts, cycle_last, clock, ntp_adj); +} + +/** + * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook + * + * private function, must hold system_time_lock lock when being + * called. Returns the number of nanoseconds since the + * last call to timeofday_periodic_hook() (adjusted by NTP scaling) + */ +static inline nsec_t __get_nsec_offset(void) +{ + cycle_t cycle_now, cycle_delta; + nsec_t ns_offset; + + /* read clocksource */ + cycle_now = read_clocksource(clock); + + /* calculate the delta since the last timeofday_periodic_hook */ + cycle_delta = (cycle_now - cycle_last) & clock->mask; + + /* convert to nanoseconds */ + ns_offset = cyc2ns(clock, ntp_adj, cycle_delta); + + /* Special case for jiffies tick/offset based systems + * add arch specific offset + */ + ns_offset += arch_getoffset(); + + return ns_offset; +} + +/** + * __get_monotonic_clock - Returns monotonically increasing nanoseconds + * + * private function, must hold system_time_lock lock when being + * called. 
Returns the monotonically increasing number of + * nanoseconds since the system booted (adjusted by NTP scaling) + */ +static inline ktime_t __get_monotonic_clock(void) +{ + nsec_t offset = __get_nsec_offset(); + return ktime_add_ns(system_time, offset); +} + +/** + * get_monotonic_clock - Returns monotonically increasing nanoseconds + * + * Returns the monotonically increasing number of nanoseconds + * since the system booted via __monotonic_clock() + */ +ktime_t get_monotonic_clock(void) +{ + unsigned long seq; + ktime_t ret; + + /* atomically read __get_monotonic_clock_kt() */ + do { + seq = read_seqbegin(&system_time_lock); + + ret = __get_monotonic_clock(); + + } while (read_seqretry(&system_time_lock, seq)); + + return ret; +} + +EXPORT_SYMBOL_GPL(get_monotonic_clock); + +/** + * get_monotonic_clock_ts - Read monotonic time in timespec format + * + * @ts: pointer to the timespec to be set + * + * Read monotonic time in nanoseconds since the system booted and + * store the result in the timespec variable pointed to by @ts + */ +void get_monotonic_clock_ts(struct timespec *ts) +{ + unsigned long seq; + struct timespec mono_ts; + nsec_t offset; + + do { + seq = read_seqbegin(&system_time_lock); + + mono_ts = mono_time_ts; + offset = __get_nsec_offset(); + } while (read_seqretry(&system_time_lock, seq)); + + *ts = timespec_add_ns(mono_ts, offset); +} + +/** + * get_realtime_offset - Returns the offset of realtime clock + * + * Returns the number of nanoseconds in ktime_t storage format which + * represent the offset of the realtime clock to the monotonic clock + */ +ktime_t get_realtime_offset(void) +{ + unsigned long seq; + ktime_t ret; + + /* atomically read wall_time_offset */ + do { + seq = read_seqbegin(&system_time_lock); + + ret = wall_time_offset; + + } while (read_seqretry(&system_time_lock, seq)); + + return ret; +} + +/** + * get_realtime_clock - Read realtime clock in ktime_t format + * + * Returns the wall time in ktime_t format. The resolution is + * nanoseconds. 
+ */ +ktime_t get_realtime_clock(void) +{ + unsigned long seq; + ktime_t ret; + + /* atomically read __get_monotonic_clock_kt() */ + do { + seq = read_seqbegin(&system_time_lock); + + ret = __get_monotonic_clock(); + ret = ktime_add(ret, wall_time_offset); + + } while (read_seqretry(&system_time_lock, seq)); + + return ret; +} + +/** + * get_realtime_clock_ts - Read the time of day into a timespec variable + * + * @ts: pointer to timespec variable to store current time + * + * Read time of day and store the result in the timespec variable + * pointed to by @ts + */ +void get_realtime_clock_ts(struct timespec *ts) +{ + struct timespec now_ts; + unsigned long seq; + nsec_t nsecs; + + do { + seq = read_seqbegin(&system_time_lock); + + now_ts = wall_time_ts; + nsecs = __get_nsec_offset(); + + } while (read_seqretry(&system_time_lock, seq)); + + *ts = timespec_add_ns(now_ts, nsecs); +} + +EXPORT_SYMBOL(get_realtime_clock_ts); + +/** + * do_gettimeofday - Read the time of day into a timeval variable + * + * @tv: pointer to timeval variable to store current time + * + * Read time of day and store the result in the timeval variable + * pointed to by @tv + * + * NOTE: The users should be converted to use get_realtime_clock_ts() + */ +void do_gettimeofday(struct timeval *tv) +{ + struct timespec now_ts; + + get_realtime_clock_ts(&now_ts); + tv->tv_sec = now_ts.tv_sec; + tv->tv_usec = now_ts.tv_nsec/1000; +} + +EXPORT_SYMBOL(do_gettimeofday); + +/** + * do_settimeofday - Sets the time of day + * + * @tv: pointer to the timespec variable containing the new time + * + * Set time of day and adjust the internal offsets, update NTP and + * the legacy time interfaces. + */ +int do_settimeofday(struct timespec *tv) +{ + unsigned long flags; + ktime_t newtime; + + newtime = timespec_to_ktime(*tv); + + write_seqlock_irqsave(&system_time_lock, flags); + + /* Calculate the new offset to the monotonic clock */ + wall_time_offset = ktime_sub(newtime, __get_monotonic_clock()); + /* Update the internal timespec variables */ + ktime_to_timespec(&wall_time_ts, + ktime_add(system_time, wall_time_offset)); + + ktime_to_timespec(&monotonic_time_offset_ts, wall_time_offset); + + ntp_clear(); + update_legacy_time_values(); + + write_sequnlock_irqrestore(&system_time_lock, flags); + + /* inform ktimers about time change */ + clock_was_set(); + + return 0; +} + +EXPORT_SYMBOL(do_settimeofday); + +/** + * __increment_system_time - Increments system time + * + * @delta: nanosecond delta to add to the time variables + * + * Private helper that increments system_time and related + * timekeeping variables. + */ +static inline void __increment_system_time(nsec_t delta) +{ + system_time = ktime_add_ns(system_time, delta); + wall_time_ts = timespec_add_ns(wall_time_ts, delta); + mono_time_ts = timespec_add_ns(mono_time_ts, delta); +} + +/** + * timeofday_suspend_hook - allows the timeofday subsystem to be shutdown + * + * @dev: unused + * @state: unused + * + * This function allows the timeofday subsystem to be shutdown for a period + * of time. Called before going into suspend/hibernate mode. + */ +static int timeofday_suspend_hook(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + + write_seqlock_irqsave(&system_time_lock, flags); + + BUG_ON(time_suspend_state != TIME_RUNNING); + + /* First off, save suspend start time + * then quickly accumulate the current nsec offset. + * These two calls hopefully occur quickly + * because the difference between reads will + * accumulate as time drift on resume. 
+ */ + suspend_start = read_persistent_clock(); + __increment_system_time(__get_nsec_offset()); + + time_suspend_state = TIME_SUSPENDED; + + write_sequnlock_irqrestore(&system_time_lock, flags); + + return 0; +} + +/** + * timeofday_resume_hook - Resumes the timeofday subsystem. + * + * @dev: unused + * + * This function resumes the timeofday subsystem from a previous call + * to timeofday_suspend_hook. + */ +static int timeofday_resume_hook(struct sys_device *dev) +{ + nsec_t suspend_end, suspend_time; + unsigned long flags; + + write_seqlock_irqsave(&system_time_lock, flags); + + BUG_ON(time_suspend_state != TIME_SUSPENDED); + + /* Read persistent clock to mark the end of + * the suspend interval then rebase the + * cycle_last to current clocksource value. + * Again, time between these two calls will + * not be accounted for and will show up as + * time drift. + */ + suspend_end = read_persistent_clock(); + cycle_last = read_clocksource(clock); + + /* calculate suspend time and add it to system time */ + suspend_time = suspend_end - suspend_start; + __increment_system_time(suspend_time); + + ntp_clear(); + + time_suspend_state = TIME_RUNNING; + + update_legacy_time_values(); + + write_sequnlock_irqrestore(&system_time_lock, flags); + + /* inform ktimers about time change */ + clock_was_set(); + + return 0; +} + +/* sysfs resume/suspend bits */ +static struct sysdev_class timeofday_sysclass = { + .resume = timeofday_resume_hook, + .suspend = timeofday_suspend_hook, + set_kset_name("timeofday"), +}; + +static struct sys_device device_timer = { + .id = 0, + .cls = &timeofday_sysclass, +}; + +static int timeofday_init_device(void) +{ + int error = sysdev_class_register(&timeofday_sysclass); + + if (!error) + error = sysdev_register(&device_timer); + + return error; +} + +device_initcall(timeofday_init_device); + + +/** + * timeofday_periodic_hook - Does periodic update of timekeeping values. + * + * @unused: unused value + * + * Calculates the delta since the last call, updates system time and + * clears the offset. + * + * Called via timeofday_timer. 
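Both __get_nsec_offset() above and the periodic hook below measure elapsed time as (cycle_now - cycle_last) & clock->mask. A standalone sketch with a hypothetical 16-bit counter shows why that expression keeps working across counter wrap-around:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t mask = 0xffff;			/* pretend 16-bit counter */
	uint64_t last = 0xfff0;			/* read just before wrap */
	uint64_t now  = 0x0010;			/* read just after wrap */

	/* unsigned subtraction plus the width mask recovers the true delta */
	uint64_t delta = (now - last) & mask;	/* 0x20 = 32 cycles */

	printf("delta = %llu cycles\n", (unsigned long long)delta);
	return 0;
}

Even though the counter wrapped from 0xfff0 to 0x0010, the masked two's-complement subtraction still yields the 32 cycles that actually elapsed.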
+ */ +static void timeofday_periodic_hook(unsigned long unused) +{ + unsigned long flags; + + cycle_t cycle_now, cycle_delta; + nsec_t delta_nsec; + static u64 remainder; + + long leapsecond; + struct clocksource* next; + + int ppm; + static int ppm_last; + + int something_changed = 0, clocksource_changed = 0; + struct clocksource old_clock; + static nsec_t second_check; + + write_seqlock_irqsave(&system_time_lock, flags); + + /* read time source & calc time since last call*/ + cycle_now = read_clocksource(clock); + cycle_delta = (cycle_now - cycle_last) & clock->mask; + + delta_nsec = cyc2ns_fixed_rem(ts_interval, &cycle_delta, &remainder); + cycle_last = (cycle_now - cycle_delta)&clock->mask; + + /* update system_time */ + __increment_system_time(delta_nsec); + + /* advance the ntp state machine by ns interval*/ + ntp_advance(delta_nsec); + + /* Only call ntp_leapsecond and ntp_sync once a sec */ + second_check += delta_nsec; + if (second_check > NSEC_PER_SEC) { + /* do ntp leap second processing*/ + leapsecond = ntp_leapsecond(wall_time_ts); + if (leapsecond) { + wall_time_offset = ktime_add_ns(wall_time_offset, + leapsecond * NSEC_PER_SEC); + wall_time_ts.tv_sec += leapsecond; + monotonic_time_offset_ts.tv_sec += leapsecond; + } + /* sync the persistent clock */ + if (ntp_synced()) + sync_persistent_clock(wall_time_ts); + second_check -= NSEC_PER_SEC; + } + + /* if necessary, switch clocksources */ + next = get_next_clocksource(); + if (next != clock) { + /* immediately set new cycle_last */ + cycle_last = read_clocksource(next); + /* update cycle_now to avoid problems in accumulation later */ + cycle_now = cycle_last; + /* swap clocksources */ + old_clock = *clock; + clock = next; + printk(KERN_INFO "Time: %s clocksource has been installed.\n", + clock->name); + ntp_clear(); + ntp_adj = 0; + remainder = 0; + something_changed = 1; + clocksource_changed = 1; + } + + /* now is a safe time, so allow clocksource to adjust + * itself (for example: to make cpufreq changes). 
+ */ + if (clock->update_callback) { + /* since clocksource state might change, + * keep a copy, but only if we've not + * already changed timesources + */ + if (!something_changed) + old_clock = *clock; + if (clock->update_callback()) { + remainder = 0; + something_changed = 1; + } + } + + /* check for new PPM adjustment */ + ppm = ntp_get_ppm_adjustment(); + if (ppm_last != ppm) { + /* make sure old_clock is set */ + if (!something_changed) + old_clock = *clock; + something_changed = 1; + } + + /* if something changed, recalculate the ntp adjustment value */ + if (something_changed) { + /* accumulate current leftover cycles using old_clock */ + if (cycle_delta) { + delta_nsec = cyc2ns_rem(&old_clock, ntp_adj, + cycle_delta, &remainder); + cycle_last = cycle_now; + __increment_system_time(delta_nsec); + ntp_advance(delta_nsec); + } + + /* recalculate the ntp adjustment and fixed interval values */ + ppm_last = ppm; + ntp_adj = ppm_to_mult_adj(clock, ppm); + ts_interval = calculate_clocksource_interval(clock, ntp_adj, + INTERVAL_LEN); + } + + update_legacy_time_values(); + + write_sequnlock_irqrestore(&system_time_lock, flags); + + if (clocksource_changed) + ktimer_clock_notify(); + + /* Set us up to go off on the next interval */ + mod_timer(&timeofday_timer, + jiffies + msecs_to_jiffies(PERIODIC_INTERVAL_MS)); +} + +/** + * timeofday_is_continuous - check to see if timekeeping is free running + * + */ +int timeofday_is_continuous(void) +{ + unsigned long seq; + int ret; + do { + seq = read_seqbegin(&system_time_lock); + + ret = clock->is_continuous; + + } while (read_seqretry(&system_time_lock, seq)); + + return ret; +} + +/** + * timeofday_init - Initializes time variables + */ +void __init timeofday_init(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&system_time_lock, flags); + + /* initialize the clock variable */ + clock = get_next_clocksource(); + + /* initialize cycle_last offset base */ + cycle_last = read_clocksource(clock); + + /* initialize wall_time_offset to now*/ + /* XXX - this should be something like ns_to_ktime() */ + wall_time_offset = ktime_add_ns(wall_time_offset, + read_persistent_clock()); + + /* initialize timespec values */ + ktime_to_timespec(&wall_time_ts, + ktime_add(system_time, wall_time_offset)); + ktime_to_timespec(&monotonic_time_offset_ts, wall_time_offset); + + + /* clear NTP scaling factor & state machine */ + ntp_adj = 0; + ntp_clear(); + ts_interval = calculate_clocksource_interval(clock, ntp_adj, + INTERVAL_LEN); + + /* initialize legacy time values */ + update_legacy_time_values(); + + write_sequnlock_irqrestore(&system_time_lock, flags); + + /* Install timeofday_periodic_hook timer */ + init_timer(&timeofday_timer); + timeofday_timer.function = timeofday_periodic_hook; + timeofday_timer.expires = jiffies + 1; + add_timer(&timeofday_timer); +} Index: linux-2.6.14/arch/i386/kernel/Makefile =================================================================== --- linux-2.6.14.orig/arch/i386/kernel/Makefile +++ linux-2.6.14/arch/i386/kernel/Makefile @@ -7,10 +7,9 @@ extra-y := head.o init_task.o vmlinux.ld obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ - doublefault.o quirks.o i8237.o + doublefault.o quirks.o i8237.o i8253.o tsc.o obj-y += cpu/ -obj-y += timers/ obj-$(CONFIG_ACPI) += acpi/ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca.o Index: linux-2.6.14/arch/i386/kernel/i8253.c 
=================================================================== --- /dev/null +++ linux-2.6.14/arch/i386/kernel/i8253.c @@ -0,0 +1,133 @@ +/* + * i8253.c 8253/PIT functions + * + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "io_ports.h" + +DEFINE_SPINLOCK(i8253_lock); +EXPORT_SYMBOL(i8253_lock); + +static void init_pit_timer(int mode) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + + if (mode != CLOCK_EVT_ONESHOT) { + /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(0x34, PIT_MODE); + udelay(10); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + } else { + /* One shot setup */ + outb_p(0x38, PIT_MODE); + udelay(10); + } + + spin_unlock_irqrestore(&i8253_lock, flags); +} + +static void pit_next_event(unsigned long evt) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + outb_p(evt & 0xff , PIT_CH0); /* LSB */ + outb(evt >> 8 , PIT_CH0); /* MSB */ + spin_unlock_irqrestore(&i8253_lock, flags); +} + +static struct clock_event pit_clockevent = { + .name = "pit", + .capabilities = CLOCK_CAP_TICK +#ifndef CONFIG_SMP + | CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE | + CLOCK_CAP_UPDATE +#endif + , + .set_mode = init_pit_timer, + .set_next_event = pit_next_event, + .start_event = io_apic_timer_ack, + .end_event = mca_timer_ack, + .shift = 32, + .irq = 0, +}; + +void setup_pit_timer(void) +{ + pit_clockevent.mult = div_sc32(CLOCK_TICK_RATE, NSEC_PER_SEC); + pit_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFF, &pit_clockevent); + pit_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &pit_clockevent); + setup_global_clockevent(&pit_clockevent, CPU_MASK_NONE); +} + +/* Since the PIT overflows every tick, its not very useful + * to just read by itself. So use jiffies to emulate a free + * running counter. + */ + +static cycle_t pit_read(void) +{ + unsigned long flags, seq; + int count; + u64 jifs; + + do { + seq = read_seqbegin(&xtime_lock); + + spin_lock_irqsave(&i8253_lock, flags); + + outb_p(0x00, PIT_MODE); /* latch the count ASAP */ + count = inb_p(PIT_CH0); /* read the latched count */ + count |= inb_p(PIT_CH0) << 8; + + /* VIA686a test code... reset the latch if count > max + 1 */ + if (count > LATCH) { + outb_p(0x34, PIT_MODE); + outb_p(LATCH & 0xff, PIT_CH0); + outb(LATCH >> 8, PIT_CH0); + count = LATCH - 1; + } + spin_unlock_irqrestore(&i8253_lock, flags); + jifs = get_jiffies_64() - INITIAL_JIFFIES; + } while (read_seqretry(&xtime_lock, seq)); + + count = (LATCH-1) - count; + + return (cycle_t)(jifs * LATCH) + count; +} + +static struct clocksource clocksource_pit = { + .name = "pit", + .rating = 110, + .type = CLOCKSOURCE_FUNCTION, + .read_fnct = pit_read, + .mask = (cycle_t)-1, + .mult = 0, + .shift = 20, +}; + +static int __init init_pit_clocksource(void) +{ + clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); + register_clocksource(&clocksource_pit); + return 0; +} +module_init(init_pit_clocksource); Index: linux-2.6.14/arch/i386/kernel/timers/timer_pit.c =================================================================== --- linux-2.6.14.orig/arch/i386/kernel/timers/timer_pit.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. 
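The pit_read() function in the new i8253.c builds a free-running cycle count from a timer that overflows every tick: whole jiffies contribute LATCH cycles each, and the latched down-counter supplies the fractional tick. A standalone sketch of that arithmetic, assuming the usual LATCH of 11932 for a 1193182 Hz PIT at HZ=100 (the specific values are illustrative):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint32_t latch = 11932;		/* PIT reload value at HZ=100 */
	uint64_t jiffies_since_boot = 1000;	/* whole ticks so far */
	uint32_t count = 5000;			/* latched PIT value (counts down) */

	/* whole ticks * LATCH, plus how far the current tick has advanced */
	uint64_t cycles = jiffies_since_boot * latch + ((latch - 1) - count);

	printf("free-running PIT cycles: %llu\n", (unsigned long long)cycles);
	return 0;
}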
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "do_timer.h" -#include "io_ports.h" - -static int count_p; /* counter in get_offset_pit() */ - -static int __init init_pit(char* override) -{ - /* check clock override */ - if (override[0] && strncmp(override,"pit",3)) - printk(KERN_ERR "Warning: clock= override failed. Defaulting to PIT\n"); - - count_p = LATCH; - return 0; -} - -static void mark_offset_pit(void) -{ - /* nothing needed */ -} - -static unsigned long long monotonic_clock_pit(void) -{ - return 0; -} - -static void delay_pit(unsigned long loops) -{ - int d0; - __asm__ __volatile__( - "\tjmp 1f\n" - ".align 16\n" - "1:\tjmp 2f\n" - ".align 16\n" - "2:\tdecl %0\n\tjns 2b" - :"=&a" (d0) - :"0" (loops)); -} - - -/* This function must be called with xtime_lock held. - * It was inspired by Steve McCanne's microtime-i386 for BSD. -- jrs - * - * However, the pc-audio speaker driver changes the divisor so that - * it gets interrupted rather more often - it loads 64 into the - * counter rather than 11932! This has an adverse impact on - * do_gettimeoffset() -- it stops working! What is also not - * good is that the interval that our timer function gets called - * is no longer 10.0002 ms, but 9.9767 ms. To get around this - * would require using a different timing source. Maybe someone - * could use the RTC - I know that this can interrupt at frequencies - * ranging from 8192Hz to 2Hz. If I had the energy, I'd somehow fix - * it so that at startup, the timer code in sched.c would select - * using either the RTC or the 8253 timer. The decision would be - * based on whether there was any other device around that needed - * to trample on the 8253. I'd set up the RTC to interrupt at 1024 Hz, - * and then do some jiggery to have a version of do_timer that - * advanced the clock by 1/1024 s. Every time that reached over 1/100 - * of a second, then do all the old code. If the time was kept correct - * then do_gettimeoffset could just return 0 - there is no low order - * divider that can be accessed. - * - * Ideally, you would be able to use the RTC for the speaker driver, - * but it appears that the speaker driver really needs interrupt more - * often than every 120 us or so. - * - * Anyway, this needs more thought.... pjsg (1993-08-28) - * - * If you are really that interested, you should be reading - * comp.protocols.time.ntp! - */ - -static unsigned long get_offset_pit(void) -{ - int count; - unsigned long flags; - static unsigned long jiffies_p = 0; - - /* - * cache volatile jiffies temporarily; we have xtime_lock. - */ - unsigned long jiffies_t; - - spin_lock_irqsave(&i8253_lock, flags); - /* timer count may underflow right here */ - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - - count = inb_p(PIT_CH0); /* read the latched count */ - - /* - * We do this guaranteed double memory access instead of a _p - * postfix in the previous port access. Wheee, hackady hack - */ - jiffies_t = jiffies; - - count |= inb_p(PIT_CH0) << 8; - - /* VIA686a test code... reset the latch if count > max + 1 */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - /* - * avoiding timer inconsistencies (they are rare, but they happen)... - * there are two kinds of problems that must be avoided here: - * 1. the timer counter underflows - * 2. 
hardware problem with the timer, not giving us continuous time, - * the counter does small "jumps" upwards on some Pentium systems, - * (see c't 95/10 page 335 for Neptun bug.) - */ - - if( jiffies_t == jiffies_p ) { - if( count > count_p ) { - /* the nutcase */ - count = do_timer_overflow(count); - } - } else - jiffies_p = jiffies_t; - - count_p = count; - - spin_unlock_irqrestore(&i8253_lock, flags); - - count = ((LATCH-1) - count) * TICK_SIZE; - count = (count + LATCH/2) / LATCH; - - return count; -} - - -/* tsc timer_opts struct */ -struct timer_opts timer_pit = { - .name = "pit", - .mark_offset = mark_offset_pit, - .get_offset = get_offset_pit, - .monotonic_clock = monotonic_clock_pit, - .delay = delay_pit, -}; - -struct init_timer_opts __initdata timer_pit_init = { - .init = init_pit, - .opts = &timer_pit, -}; - -void setup_pit_timer(void) -{ - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); - outb(LATCH >> 8 , PIT_CH0); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); -} Index: linux-2.6.14/arch/i386/kernel/timers/common.c =================================================================== --- linux-2.6.14.orig/arch/i386/kernel/timers/common.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Common functions used across the timers go here - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "mach_timer.h" - -/* ------ Calibrate the TSC ------- - * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset(). - * Too much 64-bit arithmetic here to do this cleanly in C, and for - * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2) - * output busy loop as low as possible. We avoid reading the CTC registers - * directly because of the awkward 8-bit access mechanism of the 82C54 - * device. - */ - -#define CALIBRATE_TIME (5 * 1000020/HZ) - -unsigned long calibrate_tsc(void) -{ - mach_prepare_counter(); - - { - unsigned long startlow, starthigh; - unsigned long endlow, endhigh; - unsigned long count; - - rdtsc(startlow,starthigh); - mach_countup(&count); - rdtsc(endlow,endhigh); - - - /* Error: ECTCNEVERSET */ - if (count <= 1) - goto bad_ctc; - - /* 64-bit subtract - gcc just messes up with long longs */ - __asm__("subl %2,%0\n\t" - "sbbl %3,%1" - :"=a" (endlow), "=d" (endhigh) - :"g" (startlow), "g" (starthigh), - "0" (endlow), "1" (endhigh)); - - /* Error: ECPUTOOFAST */ - if (endhigh) - goto bad_ctc; - - /* Error: ECPUTOOSLOW */ - if (endlow <= CALIBRATE_TIME) - goto bad_ctc; - - __asm__("divl %2" - :"=a" (endlow), "=d" (endhigh) - :"r" (endlow), "0" (0), "1" (CALIBRATE_TIME)); - - return endlow; - } - - /* - * The CTC wasn't reliable: we got a hit on the very first read, - * or the CPU was so fast/slow that the quotient wouldn't fit in - * 32 bits.. - */ -bad_ctc: - return 0; -} - -#ifdef CONFIG_HPET_TIMER -/* ------ Calibrate the TSC using HPET ------- - * Return 2^32 * (1 / (TSC clocks per usec)) for getting the CPU freq. - * Second output is parameter 1 (when non NULL) - * Set 2^32 * (1 / (tsc per HPET clk)) for delay_hpet(). - * calibrate_tsc() calibrates the processor TSC by comparing - * it to the HPET timer of known frequency. 
- * Too much 64-bit arithmetic here to do this cleanly in C - */ -#define CALIBRATE_CNT_HPET (5 * hpet_tick) -#define CALIBRATE_TIME_HPET (5 * KERNEL_TICK_USEC) - -unsigned long __devinit calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr) -{ - unsigned long tsc_startlow, tsc_starthigh; - unsigned long tsc_endlow, tsc_endhigh; - unsigned long hpet_start, hpet_end; - unsigned long result, remain; - - hpet_start = hpet_readl(HPET_COUNTER); - rdtsc(tsc_startlow, tsc_starthigh); - do { - hpet_end = hpet_readl(HPET_COUNTER); - } while ((hpet_end - hpet_start) < CALIBRATE_CNT_HPET); - rdtsc(tsc_endlow, tsc_endhigh); - - /* 64-bit subtract - gcc just messes up with long longs */ - __asm__("subl %2,%0\n\t" - "sbbl %3,%1" - :"=a" (tsc_endlow), "=d" (tsc_endhigh) - :"g" (tsc_startlow), "g" (tsc_starthigh), - "0" (tsc_endlow), "1" (tsc_endhigh)); - - /* Error: ECPUTOOFAST */ - if (tsc_endhigh) - goto bad_calibration; - - /* Error: ECPUTOOSLOW */ - if (tsc_endlow <= CALIBRATE_TIME_HPET) - goto bad_calibration; - - ASM_DIV64_REG(result, remain, tsc_endlow, 0, CALIBRATE_TIME_HPET); - if (remain > (tsc_endlow >> 1)) - result++; /* rounding the result */ - - if (tsc_hpet_quotient_ptr) { - unsigned long tsc_hpet_quotient; - - ASM_DIV64_REG(tsc_hpet_quotient, remain, tsc_endlow, 0, - CALIBRATE_CNT_HPET); - if (remain > (tsc_endlow >> 1)) - tsc_hpet_quotient++; /* rounding the result */ - *tsc_hpet_quotient_ptr = tsc_hpet_quotient; - } - - return result; -bad_calibration: - /* - * the CPU was so fast/slow that the quotient wouldn't fit in - * 32 bits.. - */ - return 0; -} -#endif - - -unsigned long read_timer_tsc(void) -{ - unsigned long retval; - rdtscl(retval); - return retval; -} - - -/* calculate cpu_khz */ -void init_cpu_khz(void) -{ - if (cpu_has_tsc) { - unsigned long tsc_quotient = calibrate_tsc(); - if (tsc_quotient) { - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. - */ - { unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (cpu_khz), "=d" (edx) - :"r" (tsc_quotient), - "0" (eax), "1" (edx)); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - } - } -} - Index: linux-2.6.14/arch/i386/kernel/timers/timer_tsc.c =================================================================== --- linux-2.6.14.orig/arch/i386/kernel/timers/timer_tsc.c +++ /dev/null @@ -1,595 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - * - * 2004-06-25 Jesper Juhl - * moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4 - * failing to inline. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -/* processor.h for distable_tsc flag */ -#include - -#include "io_ports.h" -#include "mach_timer.h" - -#include -#include - -#ifdef CONFIG_HPET_TIMER -static unsigned long hpet_usec_quotient; -static unsigned long hpet_last; -static struct timer_opts timer_tsc; -#endif - -static inline void cpufreq_delayed_get(void); - -int tsc_disable __devinitdata = 0; - -static int use_tsc; -/* Number of usecs that the last interrupt was delayed */ -static int delay_at_last_interrupt; - -static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ -static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_mhz * 10^6)) - * ns = cycles * (10^3 / cpu_mhz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^3 * SC / cpu_mhz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ -static unsigned long cyc2ns_scale; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_mhz) -{ - cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -static int count2; /* counter for mark_offset_tsc() */ - -/* Cached *multiplier* to convert TSC counts to microseconds. - * (see the equation below). - * Equal to 2^32 * (1 / (clocks per usec) ). - * Initialized in time_init. - */ -static unsigned long fast_gettimeoffset_quotient; - -static unsigned long get_offset_tsc(void) -{ - register unsigned long eax, edx; - - /* Read the Time Stamp Counter */ - - rdtsc(eax,edx); - - /* .. relative to previous jiffy (32 bits is enough) */ - eax -= last_tsc_low; /* tsc_low delta */ - - /* - * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient - * = (tsc_low delta) * (usecs_per_clock) - * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy) - * - * Using a mull instead of a divl saves up to 31 clock cycles - * in the critical path. - */ - - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - - /* our adjusted time offset in microseconds */ - return delay_at_last_interrupt + edx; -} - -static unsigned long long monotonic_clock_tsc(void) -{ - unsigned long long last_offset, this_offset, base; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return base + cycles_2_ns(this_offset - last_offset); -} - -/* - * Scheduler clock - returns current time in nanosec units. - */ -unsigned long long sched_clock(void) -{ - unsigned long long this_offset; - - /* - * In the NUMA case we dont use the TSC as they are not - * synchronized across all CPUs. 
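/*
 * Editor's aside (illustrative, not part of the patch): the "mull" in
 * get_offset_tsc() above converts a TSC delta to microseconds by taking
 * the high 32 bits of delta * quotient.  With the hypothetical 2 GHz CPU
 * from the calibration example (quotient ~= 2^32 / 2000), a delta of
 * 2,000,000 cycles gives
 *
 *     edx = (2,000,000 * (2^32 / 2000)) >> 32 ~= 1000 us
 *
 * i.e. exactly the 1 ms that elapsed, without a divide in the hot path.
 */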
- */ -#ifndef CONFIG_NUMA - if (!use_tsc) -#endif - /* no locking but a rare wrong value is not a big deal */ - return jiffies_64 * (1000000000 / HZ); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return cycles_2_ns(this_offset); -} - -static void delay_tsc(unsigned long loops) -{ - unsigned long bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } while ((now-bclock) < loops); -} - -#ifdef CONFIG_HPET_TIMER -static void mark_offset_tsc_hpet(void) -{ - unsigned long long this_offset, last_offset; - unsigned long offset, temp, hpet_current; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - /* - * It is important that these two operations happen almost at - * the same time. We do the RDTSC stuff first, since it's - * faster. To avoid any inconsistencies, we need interrupts - * disabled locally. - */ - /* - * Interrupts are just disabled locally since the timer irq - * has the SA_INTERRUPT flag set. -arca - */ - /* read Pentium cycle counter */ - - hpet_current = hpet_readl(HPET_COUNTER); - rdtsc(last_tsc_low, last_tsc_high); - - /* lost tick compensation */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0))) { - int lost_ticks = (offset - hpet_last) / hpet_tick; - jiffies_64 += lost_ticks; - } - hpet_last = hpet_current; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - /* - * Time offset = (hpet delta) * ( usecs per HPET clock ) - * = (hpet delta) * ( usecs per tick / HPET clocks per tick) - * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) - * Where, - * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick - */ - delay_at_last_interrupt = hpet_current - offset; - ASM_MUL64_REG(temp, delay_at_last_interrupt, - hpet_usec_quotient, delay_at_last_interrupt); -} -#endif - - -#ifdef CONFIG_CPU_FREQ -#include - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(void *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static inline void cpufreq_delayed_get(void) -{ - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n"); - schedule_work(&cpufreq_delayed_get_work); - } -} - -/* If the CPU frequency is scaled, TSC-based delays will need a different - * loops_per_jiffy value to function properly. 
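/*
 * Editor's aside (not part of the patch): in the jiffies fallback path of
 * sched_clock() above, resolution is one tick -- with HZ=250 the clock
 * advances in 1,000,000,000 / 250 = 4,000,000 ns steps.  Note that the
 * replacement in the new arch/i386/kernel/tsc.c later in this patch
 * subtracts INITIAL_JIFFIES first, so the fallback clock starts near zero
 * at boot instead of at the large jiffies wrap-test offset.
 */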
- */ - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -#ifndef CONFIG_SMP -static unsigned long fast_gettimeoffset_ref = 0; -static unsigned int cpu_khz_ref = 0; -#endif - -static int -time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - - if (val != CPUFREQ_RESUMECHANGE) - write_seqlock_irq(&xtime_lock); - if (!ref_freq) { - ref_freq = freq->old; - loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; -#ifndef CONFIG_SMP - fast_gettimeoffset_ref = fast_gettimeoffset_quotient; - cpu_khz_ref = cpu_khz; -#endif - } - - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); -#ifndef CONFIG_SMP - if (cpu_khz) - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (use_tsc) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { - fast_gettimeoffset_quotient = cpufreq_scale(fast_gettimeoffset_ref, freq->new, ref_freq); - set_cyc2ns_scale(cpu_khz/1000); - } - } -#endif - } - - if (val != CPUFREQ_RESUMECHANGE) - write_sequnlock_irq(&xtime_lock); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - - -static int __init cpufreq_tsc(void) -{ - int ret; - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); - ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - if (!ret) - cpufreq_init = 1; - return ret; -} -core_initcall(cpufreq_tsc); - -#else /* CONFIG_CPU_FREQ */ -static inline void cpufreq_delayed_get(void) { return; } -#endif - -int recalibrate_cpu_khz(void) -{ -#ifndef CONFIG_SMP - unsigned int cpu_khz_old = cpu_khz; - - if (cpu_has_tsc) { - init_cpu_khz(); - cpu_data[0].loops_per_jiffy = - cpufreq_scale(cpu_data[0].loops_per_jiffy, - cpu_khz_old, - cpu_khz); - return 0; - } else - return -ENODEV; -#else - return -ENODEV; -#endif -} -EXPORT_SYMBOL(recalibrate_cpu_khz); - -static void mark_offset_tsc(void) -{ - unsigned long lost,delay; - unsigned long delta = last_tsc_low; - int count; - int countmp; - static int count1 = 0; - unsigned long long this_offset, last_offset; - static int lost_count = 0; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - /* - * It is important that these two operations happen almost at - * the same time. We do the RDTSC stuff first, since it's - * faster. To avoid any inconsistencies, we need interrupts - * disabled locally. - */ - - /* - * Interrupts are just disabled locally since the timer irq - * has the SA_INTERRUPT flag set. -arca - */ - - /* read Pentium cycle counter */ - - rdtsc(last_tsc_low, last_tsc_high); - - spin_lock(&i8253_lock); - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - - count = inb_p(PIT_CH0); /* read the latched count */ - count |= inb(PIT_CH0) << 8; - - /* - * VIA686a test code... 
reset the latch if count > max + 1 - * from timer_pit.c - cjb - */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - spin_unlock(&i8253_lock); - - if (pit_latch_buggy) { - /* get center value of last 3 time lutch */ - if ((count2 >= count && count >= count1) - || (count1 >= count && count >= count2)) { - count2 = count1; count1 = count; - } else if ((count1 >= count2 && count2 >= count) - || (count >= count2 && count2 >= count1)) { - countmp = count;count = count2; - count2 = count1;count1 = countmp; - } else { - count2 = count1; count1 = count; count = count1; - } - } - - /* lost tick compensation */ - delta = last_tsc_low - delta; - { - register unsigned long eax, edx; - eax = delta; - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - delta = edx; - } - delta += delay_at_last_interrupt; - lost = delta/(1000000/HZ); - delay = delta%(1000000/HZ); - if (lost >= 2) { - jiffies_64 += lost-1; - - /* sanity check to ensure we're not always losing ticks */ - if (lost_count++ > 100) { - printk(KERN_WARNING "Losing too many ticks!\n"); - printk(KERN_WARNING "TSC cannot be used as a timesource. \n"); - printk(KERN_WARNING "Possible reasons for this are:\n"); - printk(KERN_WARNING " You're running with Speedstep,\n"); - printk(KERN_WARNING " You don't have DMA enabled for your hard disk (see hdparm),\n"); - printk(KERN_WARNING " Incorrect TSC synchronization on an SMP system (see dmesg).\n"); - printk(KERN_WARNING "Falling back to a sane timesource now.\n"); - - clock_fallback(); - } - /* ... but give the TSC a fair chance */ - if (lost_count > 25) - cpufreq_delayed_get(); - } else - lost_count = 0; - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - count = ((LATCH-1) - count) * TICK_SIZE; - delay_at_last_interrupt = (count + LATCH/2) / LATCH; - - /* catch corner case where tick rollover occured - * between tsc and pit reads (as noted when - * usec delta is > 90% # of usecs/tick) - */ - if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) - jiffies_64++; -} - -static int __init init_tsc(char* override) -{ - - /* check clock override */ - if (override[0] && strncmp(override,"tsc",3)) { -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) { - printk(KERN_ERR "Warning: clock= override failed. Defaulting to tsc\n"); - } else -#endif - { - return -ENODEV; - } - } - - /* - * If we have APM enabled or the CPU clock speed is variable - * (CPU stops clock on HLT or slows clock to save power) - * then the TSC timestamps may diverge by up to 1 jiffy from - * 'real time' but nothing will break. - * The most frequent case is that the CPU is "woken" from a halt - * state by the timer interrupt itself, so we get 0 error. In the - * rare cases where a driver would "wake" the CPU and request a - * timestamp, the maximum error is < 1 jiffy. But timestamps are - * still perfectly ordered. - * Note that the TSC counter will be reset if APM suspends - * to disk; this won't break the kernel, though, 'cuz we're - * smart. See arch/i386/kernel/apm.c. - */ - /* - * Firstly we have to do a CPU check for chips with - * a potentially buggy TSC. At this point we haven't run - * the ident/bugs checks so we must run this hook as it - * may turn off the TSC flag. 
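/*
 * Editor's aside (hypothetical restatement, not part of the patch): the
 * pit_latch_buggy branch in mark_offset_tsc() above is an obfuscated
 * "take the middle of the last three PIT latch reads" filter.  Its intent
 * could be written as:
 */
static inline int median3(int a, int b, int c)
{
	if ((a >= b && b >= c) || (c >= b && b >= a))
		return b;	/* b is the middle value */
	if ((b >= a && a >= c) || (c >= a && a >= b))
		return a;
	return c;
}
/* ...with the filtered count being median3(count, count1, count2). */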
- * - * NOTE: this doesn't yet handle SMP 486 machines where only - * some CPU's have a TSC. Thats never worked and nobody has - * moaned if you have the only one in the world - you fix it! - */ - - count2 = LATCH; /* initialize counter for mark_offset_tsc() */ - - if (cpu_has_tsc) { - unsigned long tsc_quotient; -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled() && hpet_use_timer) { - unsigned long result, remain; - printk("Using TSC for gettimeofday\n"); - tsc_quotient = calibrate_tsc_hpet(NULL); - timer_tsc.mark_offset = &mark_offset_tsc_hpet; - /* - * Math to calculate hpet to usec multiplier - * Look for the comments at get_offset_tsc_hpet() - */ - ASM_DIV64_REG(result, remain, hpet_tick, - 0, KERNEL_TICK_USEC); - if (remain > (hpet_tick >> 1)) - result++; /* rounding the result */ - - hpet_usec_quotient = result; - } else -#endif - { - tsc_quotient = calibrate_tsc(); - } - - if (tsc_quotient) { - fast_gettimeoffset_quotient = tsc_quotient; - use_tsc = 1; - /* - * We could be more selective here I suspect - * and just enable this for the next intel chips ? - */ - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. - */ - { unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (cpu_khz), "=d" (edx) - :"r" (tsc_quotient), - "0" (eax), "1" (edx)); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - set_cyc2ns_scale(cpu_khz/1000); - return 0; - } - } - return -ENODEV; -} - -static int tsc_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - rdtsc(last_tsc_low, last_tsc_high); -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled() && hpet_use_timer) - hpet_last = hpet_readl(HPET_COUNTER); -#endif - write_sequnlock(&monotonic_lock); - return 0; -} - -#ifndef CONFIG_X86_TSC -/* disable flag for tsc. Takes effect by clearing the TSC cpu flag - * in cpu/common.c */ -static int __init tsc_setup(char *str) -{ - tsc_disable = 1; - return 1; -} -#else -static int __init tsc_setup(char *str) -{ - printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC.\n"); - return 1; -} -#endif -__setup("notsc", tsc_setup); - - - -/************************************************************/ - -/* tsc timer_opts struct */ -static struct timer_opts timer_tsc = { - .name = "tsc", - .mark_offset = mark_offset_tsc, - .get_offset = get_offset_tsc, - .monotonic_clock = monotonic_clock_tsc, - .delay = delay_tsc, - .read_timer = read_timer_tsc, - .resume = tsc_resume, -}; - -struct init_timer_opts __initdata timer_tsc_init = { - .init = init_tsc, - .opts = &timer_tsc, -}; Index: linux-2.6.14/arch/i386/kernel/tsc.c =================================================================== --- /dev/null +++ linux-2.6.14/arch/i386/kernel/tsc.c @@ -0,0 +1,379 @@ +/* + * This code largely moved from arch/i386/kernel/timer/timer_tsc.c + * which was originally moved from arch/i386/kernel/time.c. + * See comments there for proper credits. + */ + +#include +#include +#include +#include +#include "mach_timer.h" + +/* On some systems the TSC frequency does not + * change with the cpu frequency. So we need + * an extra value to store the TSC freq + */ +unsigned int tsc_khz; + +int tsc_disable __initdata = 0; +#ifndef CONFIG_X86_TSC +/* disable flag for tsc. 
Takes effect by clearing the TSC cpu flag + * in cpu/common.c */ +static int __init tsc_setup(char *str) +{ + tsc_disable = 1; + return 1; +} +#else +static int __init tsc_setup(char *str) +{ + printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " + "cannot disable TSC.\n"); + return 1; +} +#endif +__setup("notsc", tsc_setup); + + +int read_current_timer(unsigned long *timer_val) +{ + if (!tsc_disable && cpu_khz) { + rdtscl(*timer_val); + return 0; + } + return -1; +} + +/* Code to mark and check if the TSC is unstable + * due to cpufreq or due to unsynced TSCs + */ +static int tsc_unstable; +static inline int check_tsc_unstable(void) +{ + return tsc_unstable; +} + +void mark_tsc_unstable(void) +{ + tsc_unstable = 1; +} + +/* Code to compensate for C3 stalls */ +static u64 tsc_c3_offset; +void tsc_c3_compensate(unsigned long nsecs) +{ + /* this could def be optimized */ + u64 cycles = ((u64)nsecs * tsc_khz); + do_div(cycles, 1000000); + tsc_c3_offset += cycles; +} + +EXPORT_SYMBOL_GPL(tsc_c3_compensate); + +static inline u64 tsc_read_c3_time(void) +{ + return tsc_c3_offset; +} + +/* Accellerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_mhz * 10^6)) + * ns = cycles * (10^3 / cpu_mhz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^3 * SC / cpu_mhz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ +static unsigned long cyc2ns_scale; +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline void set_cyc2ns_scale(unsigned long cpu_mhz) +{ + cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; +} + +/* + * Scheduler clock - returns current time in nanosec units. + */ +unsigned long long sched_clock(void) +{ + unsigned long long this_offset; + + /* + * In the NUMA case we dont use the TSC as they are not + * synchronized across all CPUs. + */ +#ifndef CONFIG_NUMA + if (!cpu_khz || check_tsc_unstable()) +#endif + /* no locking but a rare wrong value is not a big deal */ + return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); + + /* Read the Time Stamp Counter */ + rdtscll(this_offset); + this_offset += tsc_read_c3_time(); + + /* return the value in ns */ + return cycles_2_ns(this_offset); +} + + +static unsigned long calculate_cpu_khz(void) +{ + unsigned long long start, end; + unsigned long count; + u64 delta64; + int i; + /* run 3 times to ensure the cache is warm */ + for(i=0; i<3; i++) { + mach_prepare_counter(); + rdtscll(start); + mach_countup(&count); + rdtscll(end); + } + /* Error: ECTCNEVERSET + * The CTC wasn't reliable: we got a hit on the very first read, + * or the CPU was so fast/slow that the quotient wouldn't fit in + * 32 bits.. 
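/*
 * Editor's aside (illustrative numbers, not part of the patch): with
 * CYC2NS_SCALE_FACTOR = 10 as above, a 2000 MHz CPU gets
 *
 *     cyc2ns_scale = (1000 << 10) / 2000 = 512
 *
 * so cycles_2_ns(cyc) = (cyc * 512) >> 10 = cyc / 2, i.e. 0.5 ns per
 * cycle, as expected for a 2 GHz clock; sched_clock() stays one multiply
 * and one shift per call.
 */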
+ */ + if (count <= 1) + return 0; + + delta64 = end - start; + + /* cpu freq too fast */ + if(delta64 > (1ULL<<32)) + return 0; + /* cpu freq too slow */ + if (delta64 <= CALIBRATE_TIME_MSEC) + return 0; + + delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */ + do_div(delta64,CALIBRATE_TIME_MSEC); + + return (unsigned long)delta64; +} + +int recalibrate_cpu_khz(void) +{ +#ifndef CONFIG_SMP + unsigned long cpu_khz_old = cpu_khz; + + if (cpu_has_tsc) { + cpu_khz = calculate_cpu_khz(); + tsc_khz = cpu_khz; + cpu_data[0].loops_per_jiffy = + cpufreq_scale(cpu_data[0].loops_per_jiffy, + cpu_khz_old, cpu_khz); + return 0; + } else + return -ENODEV; +#else + return -ENODEV; +#endif +} +EXPORT_SYMBOL(recalibrate_cpu_khz); + + +void tsc_init(void) +{ + if(!cpu_has_tsc || tsc_disable) + return; + + cpu_khz = calculate_cpu_khz(); + tsc_khz = cpu_khz; + + if (!cpu_khz) + return; + + printk("Detected %lu.%03lu MHz processor.\n", + (unsigned long)cpu_khz / 1000, + (unsigned long)cpu_khz % 1000); + + set_cyc2ns_scale(cpu_khz/1000); +} + + +#ifdef CONFIG_CPU_FREQ +#include + +static unsigned int cpufreq_delayed_issched = 0; +static unsigned int cpufreq_init = 0; +static struct work_struct cpufreq_delayed_get_work; + +static void handle_cpufreq_delayed_get(void *v) +{ + unsigned int cpu; + for_each_online_cpu(cpu) { + cpufreq_get(cpu); + } + cpufreq_delayed_issched = 0; +} + +/* if we notice cpufreq oddness, schedule a call to cpufreq_get() as it tries + * to verify the CPU frequency the timing core thinks the CPU is running + * at is still correct. + */ +static inline void cpufreq_delayed_get(void) +{ + if (cpufreq_init && !cpufreq_delayed_issched) { + cpufreq_delayed_issched = 1; + printk(KERN_DEBUG "Checking if CPU frequency changed.\n"); + schedule_work(&cpufreq_delayed_get_work); + } +} + +/* If the CPU frequency is scaled, TSC-based delays will need a different + * loops_per_jiffy value to function properly. 
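/*
 * Editor's aside (illustrative numbers, not part of the patch): the
 * cpufreq notifier below rescales the delay loop with cpufreq_scale(),
 * which effectively computes old * new / ref.  If loops_per_jiffy was
 * calibrated as 4,000,000 at ref_freq = 2,000,000 kHz and the governor
 * drops the CPU to 1,000,000 kHz, the loop becomes
 *
 *     4,000,000 * 1,000,000 / 2,000,000 = 2,000,000
 *
 * cpu_khz and tsc_khz are rescaled the same way, cyc2ns_scale is
 * recomputed, and the TSC is marked unstable for timekeeping.
 */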
+ */ + +static unsigned int ref_freq = 0; +static unsigned long loops_per_jiffy_ref = 0; + +#ifndef CONFIG_SMP +static unsigned long cpu_khz_ref = 0; +#endif + +static int time_cpufreq_notifier(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = data; + + if (val != CPUFREQ_RESUMECHANGE) + write_seqlock_irq(&xtime_lock); + if (!ref_freq) { + ref_freq = freq->old; + loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; +#ifndef CONFIG_SMP + cpu_khz_ref = cpu_khz; +#endif + } + + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || + (val == CPUFREQ_RESUMECHANGE)) { + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + + if (cpu_khz) { +#ifndef CONFIG_SMP + cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); +#endif + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { + tsc_khz = cpu_khz; + set_cyc2ns_scale(cpu_khz/1000); + /* TSC based sched_clock turns + * to junk w/ cpufreq + */ + mark_tsc_unstable(); + } + } + } + + if (val != CPUFREQ_RESUMECHANGE) + write_sequnlock_irq(&xtime_lock); + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + + +static int __init cpufreq_tsc(void) +{ + int ret; + INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); + ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + if (!ret) + cpufreq_init = 1; + + return ret; +} +core_initcall(cpufreq_tsc); + +#endif + +/* Clock source code */ +#include + +static unsigned long current_tsc_khz = 0; +static cycle_t read_tsc_c3(void); +static int tsc_update_callback(void); + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .type = CLOCKSOURCE_CYCLES, + .mask = (cycle_t)-1, + .mult = 0, /* to be set */ + .shift = 22, + .update_callback = tsc_update_callback, + .is_continuous = 1, +}; + +static cycle_t read_tsc_c3(void) +{ + cycle_t ret; + rdtscll(ret); + return ret + tsc_read_c3_time(); +} + +static int tsc_update_callback(void) +{ + int change = 0; + /* check to see if we should switch to the safe clocksource */ + if (tsc_read_c3_time() && + strncmp(clocksource_tsc.name, "c3tsc", 5)) { + printk("Falling back to C3 safe TSC\n"); + clocksource_tsc.read_fnct = read_tsc_c3; + clocksource_tsc.type = CLOCKSOURCE_FUNCTION; + clocksource_tsc.name = "c3tsc"; + change = 1; + } + + if (clocksource_tsc.rating != 50 && check_tsc_unstable()) { + clocksource_tsc.rating = 50; + reselect_clocksource(); + change = 1; + } + /* only update if tsc_khz has changed */ + if (current_tsc_khz != tsc_khz){ + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + change = 1; + } + return change; +} + +static int __init init_tsc_clocksource(void) +{ + + /* TSC initialization is done in arch/i386/kernel/tsc.c */ + if (cpu_has_tsc && tsc_khz) { + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + register_clocksource(&clocksource_tsc); + } + return 0; +} + +module_init(init_tsc_clocksource); + Index: linux-2.6.14/include/asm-i386/timex.h =================================================================== --- linux-2.6.14.orig/include/asm-i386/timex.h +++ linux-2.6.14/include/asm-i386/timex.h @@ -8,6 +8,7 @@ #include #include +#include #ifdef CONFIG_X86_ELAN # 
define CLOCK_TICK_RATE 1189200 /* AMD Elan has different frequency! */ @@ -16,39 +17,6 @@ #endif -/* - * Standard way to access the cycle counter on i586+ CPUs. - * Currently only used on SMP. - * - * If you really have a SMP machine with i486 chips or older, - * compile for that, and this will just always return zero. - * That's ok, it just means that the nicer scheduling heuristics - * won't work for you. - * - * We only use the low 32 bits, and we'd simply better make sure - * that we reschedule before that wraps. Scheduling at least every - * four billion cycles just basically sounds like a good idea, - * regardless of how fast the machine is. - */ -typedef unsigned long long cycles_t; - -static inline cycles_t get_cycles (void) -{ - unsigned long long ret=0; - -#ifndef CONFIG_X86_TSC - if (!cpu_has_tsc) - return 0; -#endif - -#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC) - rdtscll(ret); -#endif - return ret; -} - -extern unsigned int cpu_khz; - extern int read_current_timer(unsigned long *timer_value); #define ARCH_HAS_READ_CURRENT_TIMER 1 Index: linux-2.6.14/include/asm-i386/tsc.h =================================================================== --- /dev/null +++ linux-2.6.14/include/asm-i386/tsc.h @@ -0,0 +1,48 @@ +/* + * linux/include/asm-i386/tsc.h + * + * i386 TSC related functions + */ +#ifndef _ASM_i386_TSC_H +#define _ASM_i386_TSC_H + +#include +#include + +/* + * Standard way to access the cycle counter on i586+ CPUs. + * Currently only used on SMP. + * + * If you really have a SMP machine with i486 chips or older, + * compile for that, and this will just always return zero. + * That's ok, it just means that the nicer scheduling heuristics + * won't work for you. + * + * We only use the low 32 bits, and we'd simply better make sure + * that we reschedule before that wraps. Scheduling at least every + * four billion cycles just basically sounds like a good idea, + * regardless of how fast the machine is. 
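/*
 * Editor's sketch (not part of the patch): how another clock would plug
 * into the framework this series introduces, modeled on the
 * clocksource_tsc hunk earlier in the new arch/i386/kernel/tsc.c.  Field
 * and function names are taken from that hunk; the header name and the
 * hardware counter accessor are assumptions, and this early GENERIC_TIME
 * prototype differs from the clocksource API that was later merged
 * upstream.
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/clocksource.h>		/* assumed header name for this series */

extern u32 read_example_counter(void);	/* hypothetical hardware accessor */

static cycle_t example_read(void)
{
	/* read a hypothetical free-running hardware counter */
	return (cycle_t)read_example_counter();
}

static struct clocksource clocksource_example = {
	.name		= "example",
	.rating		= 100,			/* below the TSC's 300 */
	.type		= CLOCKSOURCE_FUNCTION,
	.read_fnct	= example_read,
	.mask		= (cycle_t)((1ULL << 24) - 1),	/* 24-bit counter */
	.shift		= 22,
};

static int __init example_clocksource_init(void)
{
	/* counter runs at a known, fixed rate, say 3580 kHz */
	clocksource_example.mult =
		clocksource_khz2mult(3580, clocksource_example.shift);
	register_clocksource(&clocksource_example);
	return 0;
}
module_init(example_clocksource_init);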
+ */ +typedef unsigned long long cycles_t; + +static inline cycles_t get_cycles (void) +{ + unsigned long long ret=0; + +#ifndef CONFIG_X86_TSC + if (!cpu_has_tsc) + return 0; +#endif + +#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC) + rdtscll(ret); +#endif + return ret; +} + +extern unsigned int cpu_khz; +extern unsigned int tsc_khz; +extern void tsc_init(void); +void tsc_c3_compensate(unsigned long usecs); +extern void mark_tsc_unstable(void); +#endif Index: linux-2.6.14/arch/i386/kernel/setup.c =================================================================== --- linux-2.6.14.orig/arch/i386/kernel/setup.c +++ linux-2.6.14/arch/i386/kernel/setup.c @@ -1612,6 +1612,7 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; #endif #endif + tsc_init(); } #include "setup_arch_post.h" Index: linux-2.6.14/drivers/acpi/processor_idle.c =================================================================== --- linux-2.6.14.orig/drivers/acpi/processor_idle.c +++ linux-2.6.14/drivers/acpi/processor_idle.c @@ -166,6 +166,7 @@ acpi_processor_power_activate(struct acp return; } +extern void tsc_c3_compensate(unsigned long nsecs); static atomic_t c3_cpu_count; static void acpi_processor_idle(void) @@ -334,6 +335,11 @@ static void acpi_processor_idle(void) ACPI_MTX_DO_NOT_LOCK); } +#ifdef CONFIG_GENERIC_TIME + /* compensate for TSC pause */ + tsc_c3_compensate((u32)(((u64)((t2-t1)&0xFFFFFF)*286070)>>10)); +#endif + /* Re-enable interrupts */ local_irq_enable(); /* Compute time (ticks) that we were actually asleep */ Index: linux-2.6.14/include/asm-i386/mach-default/mach_timer.h =================================================================== --- linux-2.6.14.orig/include/asm-i386/mach-default/mach_timer.h +++ linux-2.6.14/include/asm-i386/mach-default/mach_timer.h @@ -15,7 +15,9 @@ #ifndef _MACH_TIMER_H #define _MACH_TIMER_H -#define CALIBRATE_LATCH (5 * LATCH) +#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */ +#define CALIBRATE_LATCH \ + ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000) static inline void mach_prepare_counter(void) { Index: linux-2.6.14/include/asm-i386/mach-summit/mach_mpparse.h =================================================================== --- linux-2.6.14.orig/include/asm-i386/mach-summit/mach_mpparse.h +++ linux-2.6.14/include/asm-i386/mach-summit/mach_mpparse.h @@ -30,6 +30,7 @@ static inline int mps_oem_check(struct m (!strncmp(productid, "VIGIL SMP", 9) || !strncmp(productid, "EXA", 3) || !strncmp(productid, "RUTHLESS SMP", 12))){ + mark_tsc_unstable(); use_cyclone = 1; /*enable cyclone-timer*/ setup_summit(); usb_early_handoff = 1; @@ -44,6 +45,7 @@ static inline int acpi_madt_oem_check(ch if (!strncmp(oem_id, "IBM", 3) && (!strncmp(oem_table_id, "SERVIGIL", 8) || !strncmp(oem_table_id, "EXA", 3))){ + mark_tsc_unstable(); use_cyclone = 1; /*enable cyclone-timer*/ setup_summit(); usb_early_handoff = 1; Index: linux-2.6.14/arch/i386/kernel/acpi/boot.c =================================================================== --- linux-2.6.14.orig/arch/i386/kernel/acpi/boot.c +++ linux-2.6.14/arch/i386/kernel/acpi/boot.c @@ -570,7 +570,7 @@ static int __init acpi_parse_sbf(unsigne } #ifdef CONFIG_HPET_TIMER - +#include static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) { struct acpi_table_hpet *hpet_tbl; @@ -592,6 +592,7 @@ static int __init acpi_parse_hpet(unsign #ifdef CONFIG_X86_64 vxtime.hpet_address = hpet_tbl->addr.addrl | ((long)hpet_tbl->addr.addrh << 32); + hpet_address = vxtime.hpet_address; printk(KERN_INFO PREFIX "HPET 
id: %#x base: %#lx\n", hpet_tbl->id, vxtime.hpet_address); @@ -600,10 +601,10 @@ static int __init acpi_parse_hpet(unsign extern unsigned long hpet_address; hpet_address = hpet_tbl->addr.addrl; - printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, hpet_address); } -#endif /* X86 */ +#endif /* X86 */ + printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", + hpet_tbl->id, hpet_address); return 0; } @@ -612,7 +613,8 @@ static int __init acpi_parse_hpet(unsign #endif #ifdef CONFIG_X86_PM_TIMER -extern u32 pmtmr_ioport; +u32 acpi_pmtmr_ioport; +int acpi_pmtmr_buggy; #endif static int __init acpi_parse_fadt(unsigned long phys, unsigned long size) @@ -640,14 +642,15 @@ static int __init acpi_parse_fadt(unsign ACPI_ADR_SPACE_SYSTEM_IO) return 0; - pmtmr_ioport = fadt->xpm_tmr_blk.address; + acpi_pmtmr_ioport = fadt->xpm_tmr_blk.address; } else { /* FADT rev. 1 */ - pmtmr_ioport = fadt->V1_pm_tmr_blk; + acpi_pmtmr_ioport = fadt->V1_pm_tmr_blk; } - if (pmtmr_ioport) - printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", - pmtmr_ioport); + + if (acpi_pmtmr_ioport) + printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", acpi_pmtmr_ioport); + #endif return 0; } Index: linux-2.6.14/arch/i386/Kconfig =================================================================== --- linux-2.6.14.orig/arch/i386/Kconfig +++ linux-2.6.14/arch/i386/Kconfig @@ -14,6 +14,10 @@ config X86 486, 586, Pentiums, and various instruction-set-compatible chips by AMD, Cyrix, and others. +config GENERIC_TIME + bool + default y + config SEMAPHORE_SLEEPERS bool default y @@ -466,6 +470,8 @@ config HPET_EMULATE_RTC depends on HPET_TIMER && RTC=y default y +source "kernel/time/Kconfig" + config SMP bool "Symmetric multi-processing support" ---help--- Index: linux-2.6.14/arch/i386/lib/delay.c =================================================================== --- linux-2.6.14.orig/arch/i386/lib/delay.c +++ linux-2.6.14/arch/i386/lib/delay.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -22,11 +23,20 @@ #include #endif -extern struct timer_opts* timer; - +/* XXX - For now just use a simple loop delay + * This has cpufreq issues, but so did the old method. + */ void __delay(unsigned long loops) { - cur_timer->delay(loops); + int d0; + __asm__ __volatile__( + "\tjmp 1f\n" + ".align 16\n" + "1:\tjmp 2f\n" + ".align 16\n" + "2:\tdecl %0\n\tjns 2b" + :"=&a" (d0) + :"0" (loops)); } inline void __const_udelay(unsigned long xloops) Index: linux-2.6.14/include/asm-i386/timeofday.h =================================================================== --- /dev/null +++ linux-2.6.14/include/asm-i386/timeofday.h @@ -0,0 +1,4 @@ +#ifndef _ASM_I386_TIMEOFDAY_H +#define _ASM_I386_TIMEOFDAY_H +#include +#endif Index: linux-2.6.14/include/asm-i386/timer.h =================================================================== --- linux-2.6.14.orig/include/asm-i386/timer.h +++ linux-2.6.14/include/asm-i386/timer.h @@ -3,68 +3,10 @@ #include #include -/** - * struct timer_ops - used to define a timer source - * - * @name: name of the timer. - * @init: Probes and initializes the timer. Takes clock= override - * string as an argument. Returns 0 on success, anything else - * on failure. - * @mark_offset: called by the timer interrupt. - * @get_offset: called by gettimeofday(). Returns the number of microseconds - * since the last timer interupt. - * @monotonic_clock: returns the number of nanoseconds since the init of the - * timer. - * @delay: delays this many clock cycles. 
- */ -struct timer_opts { - char* name; - void (*mark_offset)(void); - unsigned long (*get_offset)(void); - unsigned long long (*monotonic_clock)(void); - void (*delay)(unsigned long); - unsigned long (*read_timer)(void); - int (*suspend)(pm_message_t state); - int (*resume)(void); -}; - -struct init_timer_opts { - int (*init)(char *override); - struct timer_opts *opts; -}; - #define TICK_SIZE (tick_nsec / 1000) - -extern struct timer_opts* __init select_timer(void); -extern void clock_fallback(void); void setup_pit_timer(void); - /* Modifiers for buggy PIT handling */ - extern int pit_latch_buggy; - -extern struct timer_opts *cur_timer; -extern int timer_ack; - -/* list of externed timers */ -extern struct timer_opts timer_none; -extern struct timer_opts timer_pit; -extern struct init_timer_opts timer_pit_init; -extern struct init_timer_opts timer_tsc_init; -#ifdef CONFIG_X86_CYCLONE_TIMER -extern struct init_timer_opts timer_cyclone_init; -#endif - -extern unsigned long calibrate_tsc(void); -extern unsigned long read_timer_tsc(void); -extern void init_cpu_khz(void); extern int recalibrate_cpu_khz(void); -#ifdef CONFIG_HPET_TIMER -extern struct init_timer_opts timer_hpet_init; -extern unsigned long calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr); -#endif -#ifdef CONFIG_X86_PM_TIMER -extern struct init_timer_opts timer_pmtmr_init; -#endif #endif Index: linux-2.6.14/arch/i386/kernel/timers/Makefile =================================================================== --- linux-2.6.14.orig/arch/i386/kernel/timers/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for x86 timers -# - -obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o common.o - -obj-$(CONFIG_X86_CYCLONE_TIMER) += timer_cyclone.o -obj-$(CONFIG_HPET_TIMER) += timer_hpet.o -obj-$(CONFIG_X86_PM_TIMER) += timer_pm.o Index: linux-2.6.14/arch/i386/kernel/timers/timer.c =================================================================== --- linux-2.6.14.orig/arch/i386/kernel/timers/timer.c +++ /dev/null @@ -1,75 +0,0 @@ -#include -#include -#include -#include - -#ifdef CONFIG_HPET_TIMER -/* - * HPET memory read is slower than tsc reads, but is more dependable as it - * always runs at constant frequency and reduces complexity due to - * cpufreq. So, we prefer HPET timer to tsc based one. Also, we cannot use - * timer_pit when HPET is active. So, we default to timer_tsc. - */ -#endif -/* list of timers, ordered by preference, NULL terminated */ -static struct init_timer_opts* __initdata timers[] = { -#ifdef CONFIG_X86_CYCLONE_TIMER - &timer_cyclone_init, -#endif -#ifdef CONFIG_HPET_TIMER - &timer_hpet_init, -#endif -#ifdef CONFIG_X86_PM_TIMER - &timer_pmtmr_init, -#endif - &timer_tsc_init, - &timer_pit_init, - NULL, -}; - -static char clock_override[10] __initdata; - -static int __init clock_setup(char* str) -{ - if (str) - strlcpy(clock_override, str, sizeof(clock_override)); - return 1; -} -__setup("clock=", clock_setup); - - -/* The chosen timesource has been found to be bad. - * Fall back to a known good timesource (the PIT) - */ -void clock_fallback(void) -{ - cur_timer = &timer_pit; -} - -/* iterates through the list of timers, returning the first - * one that initializes successfully. 
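/*
 * Editor's aside (not part of the patch): the clock_override string above
 * came from the "clock=" boot parameter; each init_*() bails out with
 * -ENODEV when the override names a different timer, so e.g. booting with
 *
 *     clock=hpet
 *
 * made select_timer() settle on the HPET timer_opts (the PIT init only
 * warns and carries on).  This whole selection mechanism is removed by
 * the patch in favor of clocksource registration and rating.
 */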
- */ -struct timer_opts* __init select_timer(void) -{ - int i = 0; - - /* find most preferred working timer */ - while (timers[i]) { - if (timers[i]->init) - if (timers[i]->init(clock_override) == 0) - return timers[i]->opts; - ++i; - } - - panic("select_timer: Cannot find a suitable timer\n"); - return NULL; -} - -int read_current_timer(unsigned long *timer_val) -{ - if (cur_timer->read_timer) { - *timer_val = cur_timer->read_timer(); - return 0; - } - return -1; -} Index: linux-2.6.14/arch/i386/kernel/timers/timer_cyclone.c =================================================================== --- linux-2.6.14.orig/arch/i386/kernel/timers/timer_cyclone.c +++ /dev/null @@ -1,259 +0,0 @@ -/* Cyclone-timer: - * This code implements timer_ops for the cyclone counter found - * on IBM x440, x360, and other Summit based systems. - * - * Copyright (C) 2002 IBM, John Stultz (johnstul@us.ibm.com) - */ - - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "io_ports.h" - -/* Number of usecs that the last interrupt was delayed */ -static int delay_at_last_interrupt; - -#define CYCLONE_CBAR_ADDR 0xFEB00CD0 -#define CYCLONE_PMCC_OFFSET 0x51A0 -#define CYCLONE_MPMC_OFFSET 0x51D0 -#define CYCLONE_MPCS_OFFSET 0x51A8 -#define CYCLONE_TIMER_FREQ 100000000 -#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */ -int use_cyclone = 0; - -static u32* volatile cyclone_timer; /* Cyclone MPMC0 register */ -static u32 last_cyclone_low; -static u32 last_cyclone_high; -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* helper macro to atomically read both cyclone counter registers */ -#define read_cyclone_counter(low,high) \ - do{ \ - high = cyclone_timer[1]; low = cyclone_timer[0]; \ - } while (high != cyclone_timer[1]); - - -static void mark_offset_cyclone(void) -{ - unsigned long lost, delay; - unsigned long delta = last_cyclone_low; - int count; - unsigned long long this_offset, last_offset; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - - spin_lock(&i8253_lock); - read_cyclone_counter(last_cyclone_low,last_cyclone_high); - - /* read values for delay_at_last_interrupt */ - outb_p(0x00, 0x43); /* latch the count ASAP */ - - count = inb_p(0x40); /* read the latched count */ - count |= inb(0x40) << 8; - - /* - * VIA686a test code... 
reset the latch if count > max + 1 - * from timer_pit.c - cjb - */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - spin_unlock(&i8253_lock); - - /* lost tick compensation */ - delta = last_cyclone_low - delta; - delta /= (CYCLONE_TIMER_FREQ/1000000); - delta += delay_at_last_interrupt; - lost = delta/(1000000/HZ); - delay = delta%(1000000/HZ); - if (lost >= 2) - jiffies_64 += lost-1; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - monotonic_base += (this_offset - last_offset) & CYCLONE_TIMER_MASK; - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - count = ((LATCH-1) - count) * TICK_SIZE; - delay_at_last_interrupt = (count + LATCH/2) / LATCH; - - - /* catch corner case where tick rollover occured - * between cyclone and pit reads (as noted when - * usec delta is > 90% # of usecs/tick) - */ - if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) - jiffies_64++; -} - -static unsigned long get_offset_cyclone(void) -{ - u32 offset; - - if(!cyclone_timer) - return delay_at_last_interrupt; - - /* Read the cyclone timer */ - offset = cyclone_timer[0]; - - /* .. relative to previous jiffy */ - offset = offset - last_cyclone_low; - - /* convert cyclone ticks to microseconds */ - /* XXX slow, can we speed this up? */ - offset = offset/(CYCLONE_TIMER_FREQ/1000000); - - /* our adjusted time offset in microseconds */ - return delay_at_last_interrupt + offset; -} - -static unsigned long long monotonic_clock_cyclone(void) -{ - u32 now_low, now_high; - unsigned long long last_offset, this_offset, base; - unsigned long long ret; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - - /* Read the cyclone counter */ - read_cyclone_counter(now_low,now_high); - this_offset = ((unsigned long long)now_high<<32)|now_low; - - /* convert to nanoseconds */ - ret = base + ((this_offset - last_offset)&CYCLONE_TIMER_MASK); - return ret * (1000000000 / CYCLONE_TIMER_FREQ); -} - -static int __init init_cyclone(char* override) -{ - u32* reg; - u32 base; /* saved cyclone base address */ - u32 pageaddr; /* page that contains cyclone_timer register */ - u32 offset; /* offset from pageaddr to cyclone_timer register */ - int i; - - /* check clock override */ - if (override[0] && strncmp(override,"cyclone",7)) - return -ENODEV; - - /*make sure we're on a summit box*/ - if(!use_cyclone) return -ENODEV; - - printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); - - /* find base address */ - pageaddr = (CYCLONE_CBAR_ADDR)&PAGE_MASK; - offset = (CYCLONE_CBAR_ADDR)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); - return -ENODEV; - } - base = *reg; - if(!base){ - printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); - return -ENODEV; - } - - /* setup PMCC */ - pageaddr = (base + CYCLONE_PMCC_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_PMCC_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find 
valid PMCC register.\n"); - return -ENODEV; - } - reg[0] = 0x00000001; - - /* setup MPCS */ - pageaddr = (base + CYCLONE_MPCS_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_MPCS_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); - return -ENODEV; - } - reg[0] = 0x00000001; - - /* map in cyclone_timer */ - pageaddr = (base + CYCLONE_MPMC_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_MPMC_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - cyclone_timer = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!cyclone_timer){ - printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); - return -ENODEV; - } - - /*quick test to make sure its ticking*/ - for(i=0; i<3; i++){ - u32 old = cyclone_timer[0]; - int stall = 100; - while(stall--) barrier(); - if(cyclone_timer[0] == old){ - printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n"); - cyclone_timer = 0; - return -ENODEV; - } - } - - init_cpu_khz(); - - /* Everything looks good! */ - return 0; -} - - -static void delay_cyclone(unsigned long loops) -{ - unsigned long bclock, now; - if(!cyclone_timer) - return; - bclock = cyclone_timer[0]; - do { - rep_nop(); - now = cyclone_timer[0]; - } while ((now-bclock) < loops); -} -/************************************************************/ - -/* cyclone timer_opts struct */ -static struct timer_opts timer_cyclone = { - .name = "cyclone", - .mark_offset = mark_offset_cyclone, - .get_offset = get_offset_cyclone, - .monotonic_clock = monotonic_clock_cyclone, - .delay = delay_cyclone, -}; - -struct init_timer_opts __initdata timer_cyclone_init = { - .init = init_cyclone, - .opts = &timer_cyclone, -}; Index: linux-2.6.14/arch/i386/kernel/timers/timer_hpet.c =================================================================== --- linux-2.6.14.orig/arch/i386/kernel/timers/timer_hpet.c +++ /dev/null @@ -1,212 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "io_ports.h" -#include "mach_timer.h" -#include - -static unsigned long hpet_usec_quotient __read_mostly; /* convert hpet clks to usec */ -static unsigned long tsc_hpet_quotient __read_mostly; /* convert tsc to hpet clks */ -static unsigned long hpet_last; /* hpet counter value at last tick*/ -static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ -static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_mhz * 10^6)) - * ns = cycles * (10^3 / cpu_mhz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^3 * SC / cpu_mhz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * -johnstul@us.ibm.com "math is hard, lets go shopping!" 
- */ -static unsigned long cyc2ns_scale; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_mhz) -{ - cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -static unsigned long long monotonic_clock_hpet(void) -{ - unsigned long long last_offset, this_offset, base; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return base + cycles_2_ns(this_offset - last_offset); -} - -static unsigned long get_offset_hpet(void) -{ - register unsigned long eax, edx; - - eax = hpet_readl(HPET_COUNTER); - eax -= hpet_last; /* hpet delta */ - eax = min(hpet_tick, eax); - /* - * Time offset = (hpet delta) * ( usecs per HPET clock ) - * = (hpet delta) * ( usecs per tick / HPET clocks per tick) - * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) - * - * Where, - * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick - * - * Using a mull instead of a divl saves some cycles in critical path. - */ - ASM_MUL64_REG(eax, edx, hpet_usec_quotient, eax); - - /* our adjusted time offset in microseconds */ - return edx; -} - -static void mark_offset_hpet(void) -{ - unsigned long long this_offset, last_offset; - unsigned long offset; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - rdtsc(last_tsc_low, last_tsc_high); - - if (hpet_use_timer) - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - offset = hpet_readl(HPET_COUNTER); - if (unlikely(((offset - hpet_last) >= (2*hpet_tick)) && (hpet_last != 0))) { - int lost_ticks = ((offset - hpet_last) / hpet_tick) - 1; - jiffies_64 += lost_ticks; - } - hpet_last = offset; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); -} - -static void delay_hpet(unsigned long loops) -{ - unsigned long hpet_start, hpet_end; - unsigned long eax; - - /* loops is the number of cpu cycles. Convert it to hpet clocks */ - ASM_MUL64_REG(eax, loops, tsc_hpet_quotient, loops); - - hpet_start = hpet_readl(HPET_COUNTER); - do { - rep_nop(); - hpet_end = hpet_readl(HPET_COUNTER); - } while ((hpet_end - hpet_start) < (loops)); -} - -static struct timer_opts timer_hpet; - -static int __init init_hpet(char* override) -{ - unsigned long result, remain; - - /* check clock override */ - if (override[0] && strncmp(override,"hpet",4)) - return -ENODEV; - - if (!is_hpet_enabled()) - return -ENODEV; - - printk("Using HPET for gettimeofday\n"); - if (cpu_has_tsc) { - unsigned long tsc_quotient = calibrate_tsc_hpet(&tsc_hpet_quotient); - if (tsc_quotient) { - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. 
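/*
 * Editor's aside (illustrative numbers, not part of the patch): for the
 * hpet_usec_quotient used by get_offset_hpet() above, assume a 14.318 MHz
 * HPET and HZ=1000, so one tick is 1000 us and hpet_tick ~= 14318 HPET
 * clocks.  Then
 *
 *     hpet_usec_quotient = 2^32 * 1000 / 14318 ~= 3.0 * 10^8
 *
 * and a delta of 7159 HPET clocks (half a tick) maps to
 * (7159 * hpet_usec_quotient) >> 32 ~= 500 us -- again a multiply and a
 * shift instead of a divide.
 */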
- */ - { unsigned long eax=0, edx=1000; - ASM_DIV64_REG(cpu_khz, edx, tsc_quotient, - eax, edx); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - set_cyc2ns_scale(cpu_khz/1000); - } - /* set this only when cpu_has_tsc */ - timer_hpet.read_timer = read_timer_tsc; - } - - /* - * Math to calculate hpet to usec multiplier - * Look for the comments at get_offset_hpet() - */ - ASM_DIV64_REG(result, remain, hpet_tick, 0, KERNEL_TICK_USEC); - if (remain > (hpet_tick >> 1)) - result++; /* rounding the result */ - hpet_usec_quotient = result; - - return 0; -} - -static int hpet_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - rdtsc(last_tsc_low, last_tsc_high); - - if (hpet_use_timer) - hpet_last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - hpet_last = hpet_readl(HPET_COUNTER); - write_sequnlock(&monotonic_lock); - return 0; -} -/************************************************************/ - -/* tsc timer_opts struct */ -static struct timer_opts timer_hpet __read_mostly = { - .name = "hpet", - .mark_offset = mark_offset_hpet, - .get_offset = get_offset_hpet, - .monotonic_clock = monotonic_clock_hpet, - .delay = delay_hpet, - .resume = hpet_resume, -}; - -struct init_timer_opts __initdata timer_hpet_init = { - .init = init_hpet, - .opts = &timer_hpet, -}; Index: linux-2.6.14/arch/i386/kernel/timers/timer_none.c =================================================================== --- linux-2.6.14.orig/arch/i386/kernel/timers/timer_none.c +++ /dev/null @@ -1,39 +0,0 @@ -#include -#include - -static void mark_offset_none(void) -{ - /* nothing needed */ -} - -static unsigned long get_offset_none(void) -{ - return 0; -} - -static unsigned long long monotonic_clock_none(void) -{ - return 0; -} - -static void delay_none(unsigned long loops) -{ - int d0; - __asm__ __volatile__( - "\tjmp 1f\n" - ".align 16\n" - "1:\tjmp 2f\n" - ".align 16\n" - "2:\tdecl %0\n\tjns 2b" - :"=&a" (d0) - :"0" (loops)); -} - -/* none timer_opts struct */ -struct timer_opts timer_none = { - .name = "none", - .mark_offset = mark_offset_none, - .get_offset = get_offset_none, - .monotonic_clock = monotonic_clock_none, - .delay = delay_none, -}; Index: linux-2.6.14/arch/i386/kernel/timers/timer_pm.c =================================================================== --- linux-2.6.14.orig/arch/i386/kernel/timers/timer_pm.c +++ /dev/null @@ -1,268 +0,0 @@ -/* - * (C) Dominik Brodowski 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. - * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "mach_timer.h" - -/* Number of PMTMR ticks expected during calibration run */ -#define PMTMR_TICKS_PER_SEC 3579545 -#define PMTMR_EXPECTED_RATE \ - ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10)) - - -/* The I/O port the PMTMR resides at. 
- * The location is detected during setup_arch(), - * in arch/i386/acpi/boot.c */ -u32 pmtmr_ioport = 0; - - -/* value of the Power timer at last timer interrupt */ -static u32 offset_tick; -static u32 offset_delay; - -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ - -/*helper function to safely read acpi pm timesource*/ -static inline u32 read_pmtmr(void) -{ - u32 v1=0,v2=0,v3=0; - /* It has been reported that because of various broken - * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM time - * source is not latched, so you must read it multiple - * times to insure a safe value is read. - */ - do { - v1 = inl(pmtmr_ioport); - v2 = inl(pmtmr_ioport); - v3 = inl(pmtmr_ioport); - } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) - || (v3 > v1 && v3 < v2)); - - /* mask the output to 24 bits */ - return v2 & ACPI_PM_MASK; -} - - -/* - * Some boards have the PMTMR running way too fast. We check - * the PMTMR rate against PIT channel 2 to catch these cases. - */ -static int verify_pmtmr_rate(void) -{ - u32 value1, value2; - unsigned long count, delta; - - mach_prepare_counter(); - value1 = read_pmtmr(); - mach_countup(&count); - value2 = read_pmtmr(); - delta = (value2 - value1) & ACPI_PM_MASK; - - /* Check that the PMTMR delta is within 5% of what we expect */ - if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 || - delta > (PMTMR_EXPECTED_RATE * 21) / 20) { - printk(KERN_INFO "PM-Timer running at invalid rate: %lu%% of normal - aborting.\n", 100UL * delta / PMTMR_EXPECTED_RATE); - return -1; - } - - return 0; -} - - -static int init_pmtmr(char* override) -{ - u32 value1, value2; - unsigned int i; - - if (override[0] && strncmp(override,"pmtmr",5)) - return -ENODEV; - - if (!pmtmr_ioport) - return -ENODEV; - - /* we use the TSC for delay_pmtmr, so make sure it exists */ - if (!cpu_has_tsc) - return -ENODEV; - - /* "verify" this timing source */ - value1 = read_pmtmr(); - for (i = 0; i < 10000; i++) { - value2 = read_pmtmr(); - if (value2 == value1) - continue; - if (value2 > value1) - goto pm_good; - if ((value2 < value1) && ((value2) < 0xFFF)) - goto pm_good; - printk(KERN_INFO "PM-Timer had inconsistent results: 0x%#x, 0x%#x - aborting.\n", value1, value2); - return -EINVAL; - } - printk(KERN_INFO "PM-Timer had no reasonable result: 0x%#x - aborting.\n", value1); - return -ENODEV; - -pm_good: - if (verify_pmtmr_rate() != 0) - return -ENODEV; - - init_cpu_khz(); - return 0; -} - -static inline u32 cyc2us(u32 cycles) -{ - /* The Power Management Timer ticks at 3.579545 ticks per microsecond. - * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] - * - * Even with HZ = 100, delta is at maximum 35796 ticks, so it can - * easily be multiplied with 286 (=0x11E) without having to fear - * u32 overflows. 
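/*
 * Editor's aside (worked numbers, not part of the patch): the PM timer
 * ticks at 3.579545 MHz, so one tick is 1 / 3.579545 ~= 0.279365 us, and
 * 286 / 1024 = 0.279297 matches that to within ~0.024%.  For one HZ=100
 * jiffy (35796 PM-timer clocks) cyc2us() gives
 *
 *     (35796 * 286) >> 10 = 9997 us   (exact value: ~10000 us)
 *
 * The constant 286070 >> 10 ~= 279.365 used for the C3 compensation in
 * drivers/acpi/processor_idle.c earlier in this patch is the same
 * approximation expressed in nanoseconds per PM-timer clock.
 */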
- */ - cycles *= 286; - return (cycles >> 10); -} - -/* - * this gets called during each timer interrupt - * - Called while holding the writer xtime_lock - */ -static void mark_offset_pmtmr(void) -{ - u32 lost, delta, last_offset; - static int first_run = 1; - last_offset = offset_tick; - - write_seqlock(&monotonic_lock); - - offset_tick = read_pmtmr(); - - /* calculate tick interval */ - delta = (offset_tick - last_offset) & ACPI_PM_MASK; - - /* convert to usecs */ - delta = cyc2us(delta); - - /* update the monotonic base value */ - monotonic_base += delta * NSEC_PER_USEC; - write_sequnlock(&monotonic_lock); - - /* convert to ticks */ - delta += offset_delay; - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - - /* compensate for lost ticks */ - if (lost >= 2) - jiffies_64 += lost - 1; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } -} - -static int pmtmr_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - offset_tick = read_pmtmr(); - write_sequnlock(&monotonic_lock); - return 0; -} - -static unsigned long long monotonic_clock_pmtmr(void) -{ - u32 last_offset, this_offset; - unsigned long long base, ret; - unsigned seq; - - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = offset_tick; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the pmtmr */ - this_offset = read_pmtmr(); - - /* convert to nanoseconds */ - ret = (this_offset - last_offset) & ACPI_PM_MASK; - ret = base + (cyc2us(ret) * NSEC_PER_USEC); - return ret; -} - -static void delay_pmtmr(unsigned long loops) -{ - unsigned long bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } while ((now-bclock) < loops); -} - - -/* - * get the offset (in microseconds) from the last call to mark_offset() - * - Called holding a reader xtime_lock - */ -static unsigned long get_offset_pmtmr(void) -{ - u32 now, offset, delta = 0; - - offset = offset_tick; - now = read_pmtmr(); - delta = (now - offset)&ACPI_PM_MASK; - - return (unsigned long) offset_delay + cyc2us(delta); -} - - -/* acpi timer_opts struct */ -static struct timer_opts timer_pmtmr = { - .name = "pmtmr", - .mark_offset = mark_offset_pmtmr, - .get_offset = get_offset_pmtmr, - .monotonic_clock = monotonic_clock_pmtmr, - .delay = delay_pmtmr, - .read_timer = read_timer_tsc, - .resume = pmtmr_resume, -}; - -struct init_timer_opts __initdata timer_pmtmr_init = { - .init = init_pmtmr, - .opts = &timer_pmtmr, -}; - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Dominik Brodowski "); -MODULE_DESCRIPTION("Power Management Timer (PMTMR) as primary timing source for x86"); Index: linux-2.6.14/arch/x86_64/Kconfig =================================================================== --- linux-2.6.14.orig/arch/x86_64/Kconfig +++ linux-2.6.14/arch/x86_64/Kconfig @@ -24,6 +24,14 @@ config X86 bool default y +config GENERIC_TIME + bool + default y + +config GENERIC_TIME_VSYSCALL + bool + default y + config SEMAPHORE_SLEEPERS bool default y Index: linux-2.6.14/arch/x86_64/kernel/Makefile =================================================================== --- linux-2.6.14.orig/arch/x86_64/kernel/Makefile +++ linux-2.6.14/arch/x86_64/kernel/Makefile @@ -29,7 +29,6 @@ obj-$(CONFIG_GART_IOMMU) += pci-gart.o a obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o obj-$(CONFIG_SWIOTLB) += swiotlb.o 
obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o obj-$(CONFIG_MODULES) += module.o Index: linux-2.6.14/arch/x86_64/kernel/pmtimer.c =================================================================== --- linux-2.6.14.orig/arch/x86_64/kernel/pmtimer.c +++ /dev/null @@ -1,101 +0,0 @@ -/* Ported over from i386 by AK, original copyright was: - * - * (C) Dominik Brodowski 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. - * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. - * - * Dropped all the hardware bug workarounds for now. Hopefully they - * are not needed on 64bit chipsets. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* The I/O port the PMTMR resides at. - * The location is detected during setup_arch(), - * in arch/i386/kernel/acpi/boot.c */ -u32 pmtmr_ioport; - -/* value of the Power timer at last timer interrupt */ -static u32 offset_delay; -static u32 last_pmtmr_tick; - -#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ - -static inline u32 cyc2us(u32 cycles) -{ - /* The Power Management Timer ticks at 3.579545 ticks per microsecond. - * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] - * - * Even with HZ = 100, delta is at maximum 35796 ticks, so it can - * easily be multiplied with 286 (=0x11E) without having to fear - * u32 overflows. - */ - cycles *= 286; - return (cycles >> 10); -} - -int pmtimer_mark_offset(void) -{ - static int first_run = 1; - unsigned long tsc; - u32 lost; - - u32 tick = inl(pmtmr_ioport); - u32 delta; - - delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK); - - last_pmtmr_tick = tick; - monotonic_base += delta * NSEC_PER_USEC; - - delta += offset_delay; - - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - rdtscll(tsc); - vxtime.last_tsc = tsc - offset_delay * cpu_khz; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } - - return lost - 1; -} - -unsigned int do_gettimeoffset_pm(void) -{ - u32 now, offset, delta = 0; - - offset = last_pmtmr_tick; - now = inl(pmtmr_ioport); - delta = (now - offset) & ACPI_PM_MASK; - - return offset_delay + cyc2us(delta); -} - - -static int __init nopmtimer_setup(char *s) -{ - pmtmr_ioport = 0; - return 0; -} - -__setup("nopmtimer", nopmtimer_setup); Index: linux-2.6.14/arch/x86_64/kernel/vmlinux.lds.S =================================================================== --- linux-2.6.14.orig/arch/x86_64/kernel/vmlinux.lds.S +++ linux-2.6.14/arch/x86_64/kernel/vmlinux.lds.S @@ -99,6 +99,13 @@ SECTIONS .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } jiffies = VVIRT(.jiffies); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { *(.vsyscall_gtod_data) } + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + + .vsyscall_gtod_lock : AT(VLOAD(.vsyscall_gtod_lock)) { *(.vsyscall_gtod_lock) } + vsyscall_gtod_lock = VVIRT(.vsyscall_gtod_lock); + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } Index: linux-2.6.14/arch/x86_64/kernel/vsyscall.c 
=================================================================== --- linux-2.6.14.orig/arch/x86_64/kernel/vsyscall.c +++ linux-2.6.14/arch/x86_64/kernel/vsyscall.c @@ -19,6 +19,8 @@ * want per guest time just set the kernel.vsyscall64 sysctl to 0. */ +#include +#include #include #include #include @@ -40,6 +42,21 @@ int __sysctl_vsyscall __section_sysctl_vsyscall = 1; seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; + +struct vsyscall_gtod_data_t { + struct timeval wall_time_tv; + struct timezone sys_tz; + cycle_t offset_base; + struct clocksource clock; +}; + +extern struct vsyscall_gtod_data_t vsyscall_gtod_data; +struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data; + +extern seqlock_t vsyscall_gtod_lock; +seqlock_t __vsyscall_gtod_lock __section_vsyscall_gtod_lock = SEQLOCK_UNLOCKED; + + #include static force_inline void timeval_normalize(struct timeval * tv) @@ -53,40 +70,54 @@ static force_inline void timeval_normali } } -static force_inline void do_vgettimeofday(struct timeval * tv) +/* XXX - this is ugly. gettimeofday() has a label in it so we can't + call it twice. + */ +static force_inline int syscall_gtod(struct timeval *tv, struct timezone *tz) { - long sequence, t; - unsigned long sec, usec; - + int ret; + asm volatile("syscall" + : "=a" (ret) + : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); + return ret; +} +static force_inline void do_vgettimeofday(struct timeval* tv) +{ + cycle_t now, cycle_delta; + nsec_t nsec_delta; + unsigned long seq; do { - sequence = read_seqbegin(&__xtime_lock); - - sec = __xtime.tv_sec; - usec = (__xtime.tv_nsec / 1000) + - (__jiffies - __wall_jiffies) * (1000000 / HZ); - - if (__vxtime.mode != VXTIME_HPET) { - sync_core(); - rdtscll(t); - if (t < __vxtime.last_tsc) - t = __vxtime.last_tsc; - usec += ((t - __vxtime.last_tsc) * - __vxtime.tsc_quot) >> 32; - /* See comment in x86_64 do_gettimeofday. */ - } else { - usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - - __vxtime.last) * __vxtime.quot) >> 32; + seq = read_seqbegin(&__vsyscall_gtod_lock); + + if (__vsyscall_gtod_data.clock.type == CLOCKSOURCE_FUNCTION) { + syscall_gtod(tv, NULL); + return; } - } while (read_seqretry(&__xtime_lock, sequence)); - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; + /* read the clocksource and calc cycle_delta */ + now = read_clocksource(&__vsyscall_gtod_data.clock); + cycle_delta = (now - __vsyscall_gtod_data.offset_base) + & __vsyscall_gtod_data.clock.mask; + + /* convert cycles to nsecs */ + nsec_delta = cycle_delta * __vsyscall_gtod_data.clock.mult; + nsec_delta = nsec_delta >> __vsyscall_gtod_data.clock.shift; + + /* add nsec offset to wall_time_tv */ + *tv = __vsyscall_gtod_data.wall_time_tv; + do_div(nsec_delta, NSEC_PER_USEC); + tv->tv_usec += (unsigned long) nsec_delta; + while (tv->tv_usec >= USEC_PER_SEC) { + tv->tv_sec += 1; + tv->tv_usec -= USEC_PER_SEC; + } + } while (read_seqretry(&__vsyscall_gtod_lock, seq)); } /* RED-PEN may want to readd seq locking, but then the variable should be write-once.
*/ static force_inline void do_get_tz(struct timezone * tz) { - *tz = __sys_tz; + *tz = __vsyscall_gtod_data.sys_tz; } static force_inline int gettimeofday(struct timeval *tv, struct timezone *tz) @@ -122,11 +153,13 @@ int __vsyscall(0) vgettimeofday(struct t * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { + struct timeval tv; if (unlikely(!__sysctl_vsyscall)) return time_syscall(t); - else if (t) - *t = __xtime.tv_sec; - return __xtime.tv_sec; + vgettimeofday(&tv, 0); + if (t) + *t = tv.tv_sec; + return tv.tv_sec; } long __vsyscall(2) venosys_0(void) @@ -139,6 +172,49 @@ long __vsyscall(3) venosys_1(void) return -ENOSYS; } +struct clocksource* curr_clock; + +void arch_update_vsyscall_gtod(struct timespec wall_time, cycle_t offset_base, + struct clocksource *clock, int ntp_adj) +{ + unsigned long flags; + + write_seqlock_irqsave(&vsyscall_gtod_lock, flags); + + /* XXX - hackitty hack hack. this is terrible! */ + if (curr_clock != clock) { + if ((clock->type == CLOCKSOURCE_MMIO_32) + || (clock->type == CLOCKSOURCE_MMIO_64)) { + unsigned long vaddr = (unsigned long)clock->mmio_ptr; + pgd_t *pgd = pgd_offset_k(vaddr); + pud_t *pud = pud_offset(pgd, vaddr); + pmd_t *pmd = pmd_offset(pud,vaddr); + pte_t *pte = pte_offset_kernel(pmd, vaddr); + *pte = pte_mkread(*pte); + } + curr_clock = clock; + } + + /* save off wall time as timeval */ + vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time.tv_sec; + vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time.tv_nsec/1000; + + /* save offset_base */ + vsyscall_gtod_data.offset_base = offset_base; + + /* copy current clocksource */ + vsyscall_gtod_data.clock = *clock; + + /* apply ntp adjustment to clocksource mult */ + vsyscall_gtod_data.clock.mult += ntp_adj; + + /* save off current timezone */ + vsyscall_gtod_data.sys_tz = sys_tz; + + write_sequnlock_irqrestore(&vsyscall_gtod_lock, flags); + +} + #ifdef CONFIG_SYSCTL #define SYSCALL 0x050f @@ -217,6 +293,7 @@ static int __init vsyscall_init(void) BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); map_vsyscall(); + sysctl_vsyscall = 1; #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2, 0); #endif Index: linux-2.6.14/include/asm-generic/div64.h =================================================================== --- linux-2.6.14.orig/include/asm-generic/div64.h +++ linux-2.6.14/include/asm-generic/div64.h @@ -55,4 +55,13 @@ extern uint32_t __div64_32(uint64_t *div #endif /* BITS_PER_LONG */ +#ifndef div_long_long_rem +#define div_long_long_rem(dividend,divisor,remainder) \ +({ \ + u64 result = dividend; \ + *remainder = do_div(result,divisor); \ + result; \ +}) +#endif + #endif /* _ASM_GENERIC_DIV64_H */ Index: linux-2.6.14/include/asm-x86_64/hpet.h =================================================================== --- linux-2.6.14.orig/include/asm-x86_64/hpet.h +++ linux-2.6.14/include/asm-x86_64/hpet.h @@ -1,6 +1,6 @@ #ifndef _ASM_X8664_HPET_H #define _ASM_X8664_HPET_H 1 - +#include /* * Documentation on HPET can be found at: * http://www.intel.com/ial/home/sp/pcmmspec.htm @@ -44,6 +44,7 @@ #define HPET_TN_SETVAL 0x040 #define HPET_TN_32BIT 0x100 +extern unsigned long hpet_address; /* hpet memory map physical address */ extern int is_hpet_enabled(void); extern int hpet_rtc_timer_init(void); extern int oem_force_hpet_timer(void); Index: linux-2.6.14/include/asm-x86_64/timeofday.h =================================================================== --- /dev/null +++ linux-2.6.14/include/asm-x86_64/timeofday.h @@ 
-0,0 +1,4 @@ +#ifndef _ASM_X86_64_TIMEOFDAY_H +#define _ASM_X86_64_TIMEOFDAY_H +#include +#endif Index: linux-2.6.14/include/asm-x86_64/timex.h =================================================================== --- linux-2.6.14.orig/include/asm-x86_64/timex.h +++ linux-2.6.14/include/asm-x86_64/timex.h @@ -24,6 +24,8 @@ static inline cycles_t get_cycles (void) } extern unsigned int cpu_khz; +extern unsigned int tsc_khz; +extern void tsc_c3_compensate(unsigned long usecs); extern int read_current_timer(unsigned long *timer_value); #define ARCH_HAS_READ_CURRENT_TIMER 1 Index: linux-2.6.14/include/asm-x86_64/vsyscall.h =================================================================== --- linux-2.6.14.orig/include/asm-x86_64/vsyscall.h +++ linux-2.6.14/include/asm-x86_64/vsyscall.h @@ -22,6 +22,8 @@ enum vsyscall_num { #define __section_sysctl_vsyscall __attribute__ ((unused, __section__ (".sysctl_vsyscall"), aligned(16))) #define __section_xtime __attribute__ ((unused, __section__ (".xtime"), aligned(16))) #define __section_xtime_lock __attribute__ ((unused, __section__ (".xtime_lock"), aligned(16))) +#define __section_vsyscall_gtod_data __attribute__ ((unused, __section__ (".vsyscall_gtod_data"),aligned(16))) +#define __section_vsyscall_gtod_lock __attribute__ ((unused, __section__ (".vsyscall_gtod_lock"),aligned(16))) #define VXTIME_TSC 1 #define VXTIME_HPET 2 Index: linux-2.6.14/drivers/Makefile =================================================================== --- linux-2.6.14.orig/drivers/Makefile +++ linux-2.6.14/drivers/Makefile @@ -67,3 +67,4 @@ obj-$(CONFIG_INFINIBAND) += infiniband/ obj-$(CONFIG_SGI_IOC4) += sn/ obj-y += firmware/ obj-$(CONFIG_CRYPTO) += crypto/ +obj-$(CONFIG_GENERIC_TIME) += clocksource/ Index: linux-2.6.14/drivers/clocksource/Makefile =================================================================== --- /dev/null +++ linux-2.6.14/drivers/clocksource/Makefile @@ -0,0 +1,4 @@ +#XXX doesn't boot! obj-$(CONFIG_X86) += tsc-interp.o +obj-$(CONFIG_X86_CYCLONE_TIMER) += cyclone.o +obj-$(CONFIG_X86_PM_TIMER) += acpi_pm.o +obj-$(CONFIG_HPET_TIMER) += hpet.o Index: linux-2.6.14/drivers/clocksource/acpi_pm.c =================================================================== --- /dev/null +++ linux-2.6.14/drivers/clocksource/acpi_pm.c @@ -0,0 +1,153 @@ +/* + * linux/drivers/clocksource/acpi_pm.c + * + * This file contains the ACPI PM based clocksource. + * + * This code was largely moved from the i386 timer_pm.c file + * which was (C) Dominik Brodowski 2003 + * and contained the following comments: + * + * Driver to use the Power Management Timer (PMTMR) available in some + * southbridges as primary timing source for the Linux kernel. + * + * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, + * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. + * + * This file is licensed under the GPL v2. + */ + + +#include +#include +#include +#include + +/* Number of PMTMR ticks expected during calibration run */ +#define PMTMR_TICKS_PER_SEC 3579545 + +#if (defined(CONFIG_X86) && (!defined(CONFIG_X86_64))) +#include "mach_timer.h" +#define PMTMR_EXPECTED_RATE ((PMTMR_TICKS_PER_SEC*CALIBRATE_TIME_MSEC)/1000) +#endif + +/* The I/O port the PMTMR resides at. 
+ * The location is detected during setup_arch(), + * in arch/i386/acpi/boot.c */ +extern u32 acpi_pmtmr_ioport; +extern int acpi_pmtmr_buggy; + +#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ + + +static inline u32 read_pmtmr(void) +{ + /* mask the output to 24 bits */ + return inl(acpi_pmtmr_ioport) & ACPI_PM_MASK; +} + +static cycle_t acpi_pm_read_verified(void) +{ + u32 v1=0,v2=0,v3=0; + /* It has been reported that on various broken + * chipsets (ICH4, PIIX4 and PIIX4E) the ACPI PM clock + * source is not latched, so you must read it multiple + * times to ensure a safe value is read. + */ + do { + v1 = read_pmtmr(); + v2 = read_pmtmr(); + v3 = read_pmtmr(); + } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) + || (v3 > v1 && v3 < v2)); + + return (cycle_t)v2; +} + + +static cycle_t acpi_pm_read(void) +{ + return (cycle_t)read_pmtmr(); +} + +struct clocksource clocksource_acpi_pm = { + .name = "acpi_pm", + .rating = 200, + .type = CLOCKSOURCE_FUNCTION, + .read_fnct = acpi_pm_read, + .mask = (cycle_t)ACPI_PM_MASK, + .mult = 0, /*to be calculated*/ + .shift = 22, + .is_continuous = 1, +}; + +#if (defined(CONFIG_X86) && (!defined(CONFIG_X86_64))) +/* + * Some boards have the PMTMR running way too fast. We check + * the PMTMR rate against PIT channel 2 to catch these cases. + */ +static int __init verify_pmtmr_rate(void) +{ + u32 value1, value2; + unsigned long count, delta; + + mach_prepare_counter(); + value1 = read_pmtmr(); + mach_countup(&count); + value2 = read_pmtmr(); + delta = (value2 - value1) & ACPI_PM_MASK; + + /* Check that the PMTMR delta is within 5% of what we expect */ + if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 || + delta > (PMTMR_EXPECTED_RATE * 21) / 20) { + printk(KERN_INFO "PM-Timer running at invalid rate: %lu%% of normal - aborting.\n", 100UL * delta / PMTMR_EXPECTED_RATE); + return -1; + } + + return 0; +} +#else +#define verify_pmtmr_rate() (0) +#endif + +static int __init init_acpi_pm_clocksource(void) +{ + u32 value1, value2; + unsigned int i; + + if (!acpi_pmtmr_ioport) + return -ENODEV; + + clocksource_acpi_pm.mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, + clocksource_acpi_pm.shift); + + /* "verify" this timing source */ + value1 = read_pmtmr(); + for (i = 0; i < 10000; i++) { + value2 = read_pmtmr(); + if (value2 == value1) + continue; + if (value2 > value1) + goto pm_good; + if ((value2 < value1) && ((value2) < 0xFFF)) + goto pm_good; + printk(KERN_INFO "PM-Timer had inconsistent results: 0x%#x, 0x%#x - aborting.\n", value1, value2); + return -EINVAL; + } + printk(KERN_INFO "PM-Timer had no reasonable result: 0x%#x - aborting.\n", value1); + return -ENODEV; + +pm_good: + if (verify_pmtmr_rate() != 0) + return -ENODEV; + + /* check to see if pmtmr is known buggy */ + if (acpi_pmtmr_buggy) { + clocksource_acpi_pm.read_fnct = acpi_pm_read_verified; + clocksource_acpi_pm.rating = 110; + } + + register_clocksource(&clocksource_acpi_pm); + return 0; +} + +module_init(init_acpi_pm_clocksource); Index: linux-2.6.14/drivers/clocksource/cyclone.c =================================================================== --- /dev/null +++ linux-2.6.14/drivers/clocksource/cyclone.c @@ -0,0 +1,139 @@ +#include +#include +#include +#include +#include + +#include +#include +#include "mach_timer.h" + +#define CYCLONE_CBAR_ADDR 0xFEB00CD0 /* base address ptr*/ +#define CYCLONE_PMCC_OFFSET 0x51A0 /* offset to control register */ +#define CYCLONE_MPCS_OFFSET 0x51A8 /* offset to select register */ +#define CYCLONE_MPMC_OFFSET 0x51D0 /* offset to count
register */ +#define CYCLONE_TIMER_FREQ 100000000 +#define CYCLONE_TIMER_MASK (0xFFFFFFFF) /* 32 bit mask */ + +int use_cyclone = 0; + +struct clocksource clocksource_cyclone = { + .name = "cyclone", + .rating = 250, + .type = CLOCKSOURCE_MMIO_32, + .mmio_ptr = NULL, /* to be set */ + .mask = (cycle_t)CYCLONE_TIMER_MASK, + .mult = 10, + .shift = 0, + .is_continuous = 1, +}; + +static unsigned long __init calibrate_cyclone(void) +{ + u64 delta64; + unsigned long start, end; + unsigned long i, count; + unsigned long cyclone_freq_khz; + + /* repeat 3 times to make sure the cache is warm */ + for(i=0; i < 3; i++) { + mach_prepare_counter(); + start = readl(clocksource_cyclone.mmio_ptr); + mach_countup(&count); + end = readl(clocksource_cyclone.mmio_ptr); + } + + delta64 = end - start; + + delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */ + do_div(delta64,CALIBRATE_TIME_MSEC); + + cyclone_freq_khz = (unsigned long)delta64; + + printk("calculated cyclone_freq: %lu khz\n", cyclone_freq_khz); + return cyclone_freq_khz; +} + +static int __init init_cyclone_clocksource(void) +{ + unsigned long base; /* saved value from CBAR */ + unsigned long offset; + u32 __iomem* reg; + u32 __iomem* volatile cyclone_timer; /* Cyclone MPMC0 register */ + unsigned long khz; + int i; + + /*make sure we're on a summit box*/ + if (!use_cyclone) return -ENODEV; + + printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); + + /* find base address */ + offset = CYCLONE_CBAR_ADDR; + reg = ioremap_nocache(offset, sizeof(reg)); + if(!reg){ + printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); + return -ENODEV; + } + /* even on 64bit systems, this is only 32bits */ + base = readl(reg); + if(!base){ + printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); + return -ENODEV; + } + iounmap(reg); + + /* setup PMCC */ + offset = base + CYCLONE_PMCC_OFFSET; + reg = ioremap_nocache(offset, sizeof(reg)); + if(!reg){ + printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n"); + return -ENODEV; + } + writel(0x00000001,reg); + iounmap(reg); + + /* setup MPCS */ + offset = base + CYCLONE_MPCS_OFFSET; + reg = ioremap_nocache(offset, sizeof(reg)); + if(!reg){ + printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); + return -ENODEV; + } + writel(0x00000001,reg); + iounmap(reg); + + /* map in cyclone_timer */ + offset = base + CYCLONE_MPMC_OFFSET; + cyclone_timer = ioremap_nocache(offset, sizeof(u64)); + if(!cyclone_timer){ + printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); + return -ENODEV; + } + + /*quick test to make sure its ticking*/ + for(i=0; i<3; i++){ + u32 old = readl(cyclone_timer); + int stall = 100; + while(stall--) barrier(); + if(readl(cyclone_timer) == old){ + printk(KERN_ERR "Summit chipset: Counter not counting! 
DISABLED\n"); + iounmap(cyclone_timer); + cyclone_timer = NULL; + return -ENODEV; + } + } + clocksource_cyclone.mmio_ptr = cyclone_timer; + + /* sort out mult/shift values */ + khz = calibrate_cyclone(); + clocksource_cyclone.shift = 22; + clocksource_cyclone.mult = clocksource_khz2mult(khz, + clocksource_cyclone.shift); + + register_clocksource(&clocksource_cyclone); + + return 0; +} + +module_init(init_cyclone_clocksource); Index: linux-2.6.14/drivers/clocksource/hpet.c =================================================================== --- /dev/null +++ linux-2.6.14/drivers/clocksource/hpet.c @@ -0,0 +1,60 @@ +#include +#include +#include +#include +#include +#include + +#define HPET_MASK (0xFFFFFFFF) +#define HPET_SHIFT 22 + +/* FSEC = 10^-15 NSEC = 10^-9 */ +#define FSEC_PER_NSEC 1000000 + +struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .type = CLOCKSOURCE_MMIO_32, + .mmio_ptr = NULL, + .mask = (cycle_t)HPET_MASK, + .mult = 0, /* set below */ + .shift = HPET_SHIFT, + .is_continuous = 1, +}; + +static int __init init_hpet_clocksource(void) +{ + unsigned long hpet_period; + void __iomem* hpet_base; + u64 tmp; + + if (!hpet_address) + return -ENODEV; + + /* calculate the hpet address */ + hpet_base = + (void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + clocksource_hpet.mmio_ptr = hpet_base + HPET_COUNTER; + + /* calculate the frequency */ + hpet_period = readl(hpet_base + HPET_PERIOD); + + + /* hpet period is in femtoseconds per cycle + * so we need to convert this to ns/cyc units + * approximated by mult/2^shift + * + * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift + * fsec/cyc * 1ns/1000000fsec * 2^shift = mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult + * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + + register_clocksource(&clocksource_hpet); + return 0; +} +module_init(init_hpet_clocksource); Index: linux-2.6.14/drivers/clocksource/tsc-interp.c =================================================================== --- /dev/null +++ linux-2.6.14/drivers/clocksource/tsc-interp.c @@ -0,0 +1,112 @@ +/* TSC-Jiffies Interpolation clocksource + Example interpolation clocksource.
+TODO: + o per-cpu TSC offsets +*/ +#include +#include +#include +#include +#include +#include +#include + +static unsigned long current_tsc_khz = 0; + +static seqlock_t tsc_interp_lock = SEQLOCK_UNLOCKED; +static unsigned long tsc_then; +static unsigned long jiffies_then; +struct timer_list tsc_interp_timer; + +static unsigned long mult, shift; + +#define NSEC_PER_JIFFY ((((unsigned long long)NSEC_PER_SEC)<<8)/ACTHZ) +#define SHIFT_VAL 22 + +static cycle_t read_tsc_interp(void); +static void tsc_interp_update_callback(void); + +static struct clocksource clocksource_tsc_interp = { + .name = "tsc-interp", + .rating = 150, + .type = CLOCKSOURCE_FUNCTION, + .read_fnct = read_tsc_interp, + .mask = (cycle_t)((1ULL<<32)-1), + .mult = 1<> shift); + else + ret += (cycle_t)(jiffs_now - jiffs_then)*NSEC_PER_JIFFY; + + return ret; +} + +static void tsc_interp_update_callback(void) +{ + /* only update if tsc_khz has changed */ + if (current_tsc_khz != tsc_khz){ + current_tsc_khz = tsc_khz; + mult = clocksource_khz2mult(current_tsc_khz, shift); + } +} + + +static int __init init_tsc_interp_clocksource(void) +{ + /* TSC initialization is done in arch/i386/kernel/tsc.c */ + if (cpu_has_tsc && tsc_khz) { + current_tsc_khz = tsc_khz; + shift = SHIFT_VAL; + mult = clocksource_khz2mult(current_tsc_khz, shift); + /* setup periodic soft-timer */ + init_timer(&tsc_interp_timer); + tsc_interp_timer.function = tsc_interp_sync; + tsc_interp_timer.expires = jiffies; + add_timer(&tsc_interp_timer); + + register_clocksource(&clocksource_tsc_interp); + } + return 0; +} +module_init(init_tsc_interp_clocksource); Index: linux-2.6.14/kernel/time/Kconfig =================================================================== --- /dev/null +++ linux-2.6.14/kernel/time/Kconfig @@ -0,0 +1,36 @@ +# +# Timer subsystem related configuration options +# + +config KTIME_SCALAR + bool "Ktimers 64bit scalar representation" + depends on !64BIT + default n + help + (You dont want to change this unless you want to hack the + timer code. Just keep it disabled.) + + This option enables the 64bit based scalar representation + of the ktimer internal variables on 32bit systems. On i386 + this results in denser code and slightly better overall + performance. + +config HIGH_RES_TIMERS + bool "High Resolution Timer Support" + depends on GENERIC_TIME + help + This option enables high resolution timer support. If your + hardware is not capable then this option only increases + the size of the kernel image. + +config HIGH_RES_RESOLUTION + int "High Resolution Timer resolution (nanoseconds)" + depends on HIGH_RES_TIMERS + default 1000 + help + This sets the resolution in nanoseconds of the high resolution + timers. Too fine a resolution (small a number) will usually + not be observable due to normal system latencies. For an + 800 MHz processor about 10,000 (10 microseconds) is recommended as a + finest resolution. If you don't need that sort of resolution, + larger values may generate less overhead. Index: linux-2.6.14/include/linux/clockchips.h =================================================================== --- /dev/null +++ linux-2.6.14/include/linux/clockchips.h @@ -0,0 +1,127 @@ +/* linux/include/linux/clockchips.h + * + * This file contains the structure definitions for clockchips. + * + * If you are not a clockchip, or the time of day code, you should + * not be including this file! 
+ */ +#ifndef _LINUX_CLOCKCHIPS_H +#define _LINUX_CLOCKCHIPS_H + +#include + +#ifdef CONFIG_GENERIC_TIME + +#include +#include + +/* Clock event modes and commands */ +enum { + CLOCK_EVT_NONE, + CLOCK_EVT_STARTUP, + CLOCK_EVT_PERIODIC, + CLOCK_EVT_ONESHOT, + CLOCK_EVT_IPI, + CLOCK_EVT_STOP, + CLOCK_EVT_SHUTDOWN, + CLOCK_EVT_RUN_CYCLIC, + CLOCK_EVT_SCHEDTICK, + CLOCK_EVT_NOTICK, +}; + +/* Clock event capability flags */ +#define CLOCK_CAP_TICK 0x000001 + +#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_DYNTICK) +#define CLOCK_CAP_NEXTEVT 0x000002 +#else +#define CLOCK_CAP_NEXTEVT 0x000000 +#endif + +#define CLOCK_CAP_UPDATE 0x000004 + +#ifndef CONFIG_PROFILE_NMI +#define CLOCK_CAP_PROFILE 0x000008 +#else +#define CLOCK_CAP_PROFILE 0x000000 +#endif + +#define CLOCK_CAP_MASK (CLOCK_CAP_TICK | CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE | CLOCK_CAP_UPDATE) + +/* The device has its own interrupt handler */ +#define CLOCK_HAS_IRQHANDLER 0x010000 + +struct clock_event; + +/** + * struct clock_event - clock event descriptor + * + * @name: ptr to clock event name + * @capabilities: capabilities of the event chip + * @max_delta_ns: maximum delta value in ns + * @min_delta_ns: minimum delta value in ns + * @mult: nanosecond to cycles multiplier + * @shift: nanoseconds to cycles divisor (power of two) + * @set_next_event: set next event + * @set_mode: set mode function + * @suspend: suspend function (optional) + * @resume: resume function (optional) + * @evthandler: Assigned by the framework to be called by the low + * level handler of the event source + * @start_event: called on entry (optional for chip handling...) + * @end_event: called on exit (optional for chip handling...) + * @priv: private device data + */ +struct clock_event { + const char* name; + unsigned int capabilities; + unsigned long max_delta_ns; + unsigned long min_delta_ns; + u32 mult; + u32 shift; + void (*set_next_event)(unsigned long evt); + void (*set_mode)(int mode); + int (*suspend)(void); + int (*resume)(void); + void (*event_handler)(struct pt_regs *regs); + void (*start_event)(void *priv); + void (*end_event)(void *priv); + unsigned int irq; + void *priv; +}; + + + +/* + * Calculate a multiplication factor with shift=32 + */ +static inline unsigned long div_sc32(unsigned long a, unsigned long b) +{ + u64 tmp = ((u64)a) << 32; + do_div(tmp, b); + return (unsigned long) tmp; +} + +static inline unsigned long mpy_sc32(unsigned long a, unsigned long b) +{ + u64 res = (u64) a * b; + + return (unsigned long) (res >> 32); +} + +/* Clock event layer functions */ +extern int setup_local_clockevent(struct clock_event *, cpumask_t cpumask); +extern int setup_global_clockevent(struct clock_event *, cpumask_t cpumask); +extern unsigned long clockevent_delta2ns(unsigned long latch, struct clock_event *evt); +extern void init_clockevents(void); + +extern int clockevents_init_next_event(void); +extern int clockevents_set_next_event(ktime_t expires, ktime_t now); +extern void clockevents_trigger_next_event(void); +extern int clockevents_next_event_available(void); + +#else +# define init_clockevents() do { } while(0) +#endif + +#endif Index: linux-2.6.14/kernel/time/clockevents.c =================================================================== --- /dev/null +++ linux-2.6.14/kernel/time/clockevents.c @@ -0,0 +1,605 @@ +/* + * linux/kernel/time/clockevents.c + * + * This file contains functions which manage clock event drivers. 
+ * + * Copyright(C) 2005 Thomas Gleixner + * + * Kudos to Ingo Molnar for review, criticism, ideas + * + * We have two types of clock event devices: + * - global events (one device per system) + * - local events (one device per cpu) + * + * We assign the various time(r) related interrupts to those devices + * + * - global tick + * - profiling (per cpu) + * - next timer events (per cpu) + * + * TODO: + * - implement variable frequency profiling + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_CLOCK_EVENTS 4 + +struct event_descr { + struct clock_event *event; + unsigned int mode; + unsigned int real_caps; + struct irqaction action; +}; + +struct local_events { + int installed; + struct event_descr events[MAX_CLOCK_EVENTS]; + struct clock_event *nextevt; +}; + +/* Variables related to the global event source */ +static struct event_descr global_eventsource; + +/* Variables related to the per cpu local event sources */ +static DEFINE_PER_CPU(struct local_events, local_eventsources); + +#ifdef CONFIG_SMP +# define recalc_global_event(e) do { } while(0) +#else +# define recalc_global_event(c) recalc_active_event(&global_eventsource, c) +#endif + +/* + * Math helper. Convert a latch value to ns + */ +unsigned long clockevent_delta2ns(unsigned long latch, struct clock_event *evt) +{ + u64 clc = ((u64) latch << evt->shift); + + do_div(clc, evt->mult); + if (clc < KTIME_MONOTONIC_RES) + clc = KTIME_MONOTONIC_RES; + if (clc > 0x7FFFFFFF) + clc = 0x7FFFFFFF; + + return (unsigned long) clc; +} + +/* + * Generic timer interrupt handler usable for all kinds of events + */ +static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct clock_event *evt = dev_id; + + if (evt->start_event) + evt->start_event(evt->priv); + + evt->event_handler(regs); + + if (evt->end_event) + evt->end_event(evt->priv); + + return IRQ_HANDLED; +} + +/* + * Handle tick + */ +static void handle_tick(struct pt_regs *regs) +{ + write_seqlock(&xtime_lock); + do_timer(regs); + write_sequnlock(&xtime_lock); +} + +/* + * Handle tick and update + */ +static void handle_tick_update(struct pt_regs *regs) +{ + write_seqlock(&xtime_lock); + do_timer(regs); + write_sequnlock(&xtime_lock); + + update_process_times(user_mode(regs)); +} + +/* + * Handle tick, update and profiling + */ +static void handle_tick_update_profile(struct pt_regs *regs) +{ + write_seqlock(&xtime_lock); + do_timer(regs); + write_sequnlock(&xtime_lock); + + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING, regs); +} + +/* + * Handle update + */ +static void handle_update(struct pt_regs *regs) +{ + update_process_times(user_mode(regs)); +} + +/* + * Handle update and profile + */ +static void handle_update_profile(struct pt_regs *regs) +{ + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING, regs); +} + +/* + * Handle profile + */ +static void handle_profile(struct pt_regs *regs) +{ + profile_tick(CPU_PROFILING, regs); +} + +/* + * Handle next event + */ +static void handle_nextevent(struct pt_regs *regs) +{ + ktimer_interrupt(); +} + +/* + * Handle next event, tick + */ +static void handle_nextevent_tick(struct pt_regs *regs) +{ + int res; + + res = ktimer_interrupt(); + for (; res > 0; res--) + handle_tick(regs); +} + +/* + * Handle next event, update + */ +static void handle_nextevent_update(struct pt_regs *regs) +{ + if (ktimer_interrupt() > 0) + handle_update(regs); +} + +/* + * Handle next event, tick, update + */ +static void 
handle_nextevent_tick_update(struct pt_regs *regs) +{ + int res; + + if ((res = ktimer_interrupt()) == 0) + return; + + for (; res > 0; res--) + handle_tick(regs); + + handle_update(regs); +} + +/* + * Handle next event, profile + */ +static void handle_nextevent_profile(struct pt_regs *regs) +{ + if (ktimer_interrupt() > 0) + handle_profile(regs); +} + +/* + * Handle next event, update, profile + */ +static void handle_nextevent_update_profile(struct pt_regs *regs) +{ + if (ktimer_interrupt() > 0) + handle_update_profile(regs); +} + +/* + * Handle next event, tick, update, profile + */ +static void handle_nextevent_all(struct pt_regs *regs) +{ + int res; + + if ((res = ktimer_interrupt()) == 0) + return; + + for (; res > 0; res--) + handle_tick(regs); + + handle_update_profile(regs); } + +/* + * Lookup table for event assignment + */ +static void *event_handlers[] = { + NULL, /* 0: No capability selected */ + handle_tick, /* 1: Tick only */ + handle_nextevent, /* 2: Next event only */ + handle_nextevent_tick, /* 3: Next event + tick */ + handle_update, /* 4: Update process times */ + handle_tick_update, /* 5: Tick + update process times */ + handle_nextevent_update, /* 6: Next event + + update process times */ + handle_nextevent_tick_update, /* 7: Next event + tick + + update process times */ + handle_profile, /* 8: Profiling int */ + NULL, /* 9: Tick + profiling */ + handle_nextevent_profile, /* A: Next event + profiling */ + NULL, /* B: Next event + tick + profiling */ + handle_update_profile, /* C: Update process times + + profiling */ + handle_tick_update_profile, /* D: Tick + update process times + + profiling */ + handle_nextevent_update_profile,/* E: Next event + + update process times + + profiling */ + handle_nextevent_all, /* F: Next event + tick + + update process times + + profiling */ +}; + +/* + * The selection model makes the following assumptions: + * + * There is only one global event source set up. Global event sources + * are unique devices in a system (UP/SMP). Usually they are set up + * early in the bootup phase to provide the basic tick environment to + * bring up hardware. Such a device can be capable of providing all in + * one functionality including next event scheduling. + * + * When a system has dedicated event sources which can be used for + * particular purposes then we assume that there are no devices set up + * which provide "competing" functionality. i.e. the developer has to + * decide which device should be used for a particular functionality + * rather than letting the management code guess about the best + * fit. The code manages the cases where the number of event sources + * is unknown during compile time, but the functionality of the event + * source is assigned to the respective event source by a human best + * fit decision. + * + * The purpose of the management code is to provide handling code for + * the various possible combinations and the necessary infrastructure + * to handle next event (e.g. high resolution) scheduling with a + * single event source, which makes a periodic rescheduling of the + * tick interrupt necessary. This is done to avoid the #ifdef mess all + * over the architecture dependent timer and event interrupt code for + * the various possible use case combinations and allows clean non + * intrusive implementation of configurable extensions to the time + * related event system e.g. dynamic ticks, high resolution + * timers. + * + * Some architectures can use an NMI based profiling mechanism.
If this + * is used, then profiling is excluded from the event assignments. + * + * On SMP systems, CPUs which have no unique global event source should not + * set up a global event source. The correct way is setting up one + * event source (usually local to CPU0 or the bootcpu in hotplug + * systems) which has the CLOCK_CAP_TICK flag set, so the management + * code assigns exactly one tick source for the complete system. + * + * A special case is pseudo event sources (IPI mechanisms) on SMP + * systems. They can be used for populating tick events from one event + * source across multiple CPUs. + * + */ +static int setup_event(struct event_descr *descr, struct clock_event *evt, + unsigned int caps, cpumask_t cpumask) +{ + void *handler = event_handlers[caps]; + + if (!handler) { + printk(KERN_ERR "Unsupported event source %s\n", evt->name); + return -EINVAL; + } + + /* Store the event handler */ + evt->event_handler = handler; + + /* Save the event descriptor reference */ + descr->event = evt; + + if (!(evt->capabilities & CLOCK_HAS_IRQHANDLER)) { + descr->action.name = evt->name; + descr->action.handler = timer_interrupt; + descr->action.flags = SA_INTERRUPT; + descr->action.mask = cpumask; + descr->action.dev_id = evt; + setup_irq(evt->irq, &descr->action); + } + + descr->real_caps = caps; + descr->mode = CLOCK_EVT_STARTUP; + if (evt->set_mode) + evt->set_mode(CLOCK_EVT_STARTUP); + printk(KERN_INFO "Event source %s installed with caps set: %02x\n", + descr->event->name, descr->real_caps); + + return 0; +} + +/* + * Mask out the functionality which is covered by the new event source + * and assign a new event handler. + */ +static unsigned int recalc_active_event(struct event_descr *descr, + unsigned int caps) +{ + unsigned int gcaps; + + if (!descr->event) + return caps; + + /* Find out the overlapping bits */ + gcaps = descr->real_caps & caps; + + /* + * Be careful here. We don't know in which order the event + * sources are set up. So we might switch off a previously + * registered source completely. + * + * Might need more thoughts though.
+ */ + if (gcaps == descr->real_caps) { + int i; + + i = ffs(gcaps) - 1; + gcaps &= ~(1 << i); + caps &= ~(1 << i); + } + if (!gcaps) + return caps; + + /* Mask the bits which are now covered by the new event */ + descr->real_caps &= ~gcaps; + + /* Assign the new event handler */ + descr->event->event_handler = event_handlers[descr->real_caps]; + printk(KERN_INFO "Event source %s new caps set: %02x\n" , + descr->event->name, descr->real_caps); + + return caps; +} + +/* + * Recalc the events and reassign the handlers if necessary + */ +static int recalc_events(struct local_events *sources, struct clock_event *evt, + cpumask_t cpumask) +{ + unsigned int caps = evt->capabilities & CLOCK_CAP_MASK; + int i; + + if (sources->installed == MAX_CLOCK_EVENTS) + return -ENOSPC; + + if (!event_handlers[caps]) + return -EINVAL; + + recalc_global_event(caps); + + for (i = 0; i < sources->installed; i++) + caps = recalc_active_event(&sources->events[i], caps); + + setup_event(&sources->events[sources->installed], evt, caps, cpumask); + sources->installed++; + if (evt->capabilities & CLOCK_CAP_NEXTEVT) { + sources->nextevt = evt; + ktimer_clock_notify(); + } + + return 0; +} + +/** + * setup_local_clockevent - Set up a cpu local clock event device + * + * @evtdev: event device to be registered + * @cpumask: cpumask for the irq setup + */ +int setup_local_clockevent(struct clock_event *evtdev, cpumask_t cpumask) +{ + struct local_events *sources = &__get_cpu_var(local_eventsources); + unsigned long flags; + int res; + + /* Recalc event sources and maybe reassign interrupts */ + local_irq_save(flags); + res = recalc_events(sources, evtdev, cpumask); + local_irq_restore(flags); + + return res; +} +EXPORT_SYMBOL(setup_local_clockevent); + +/** + * setup_global_clockevent - Set the device which generates global clock events + * + * @evt: The device which generates global clock events (ticks) + */ +int __init setup_global_clockevent(struct clock_event *evt, cpumask_t cpumask) +{ + int res; + + res = setup_event(&global_eventsource, evt, + evt->capabilities & CLOCK_CAP_MASK, cpumask); +#ifndef CONFIG_SMP + /* + * The "global" event source on UP systems can serve as + * next event source! + */ + if (!res && (evt->capabilities & CLOCK_CAP_NEXTEVT)) + per_cpu(local_eventsources, 0).nextevt = evt; +#endif + return res; +} + +/** + * clockevents_next_event_available - Check for an installed next event source + */ +int clockevents_next_event_available(void) +{ + struct local_events *sources = &__get_cpu_var(local_eventsources); + int i; + + if (!sources->nextevt) + return 0; + +#ifndef CONFIG_SMP + if (sources->nextevt == global_eventsource.event) + return CLOCK_EVT_SCHEDTICK; +#endif + /* + * Check whether the next event source is solely for next events or + * whether it has to do some periodic tick functionality. + * We use the real_caps field here, as some other source might + * have switched off one of the capability flags.
+ */ + for (i = 0; i < sources->installed; i++) { + if (sources->nextevt != sources->events[i].event) + continue; + + if (sources->events[i].real_caps & ~CLOCK_CAP_NEXTEVT) + return CLOCK_EVT_SCHEDTICK; + return CLOCK_EVT_NOTICK; + } + return CLOCK_EVT_NOTICK; +} + +int clockevents_init_next_event(void) +{ + struct local_events *sources = &__get_cpu_var(local_eventsources); + + if (!sources->nextevt) + return 0; + + if (sources->nextevt->set_mode) + sources->nextevt->set_mode(CLOCK_EVT_ONESHOT); + + return 1; +} + +int clockevents_set_next_event(ktime_t expires, ktime_t now) +{ + struct local_events *sources = &__get_cpu_var(local_eventsources); + nsec_t delta = ktime_to_ns(ktime_sub(expires, now)); + unsigned long clc; + + if (delta <= 0) + return -ETIME; + if (delta > sources->nextevt->max_delta_ns) + delta = sources->nextevt->max_delta_ns; + if (delta < sources->nextevt->min_delta_ns) + delta = sources->nextevt->min_delta_ns; + + clc = mpy_sc32((unsigned long) delta, sources->nextevt->mult); + sources->nextevt->set_next_event(clc); + return 0; +} + +void clockevents_trigger_next_event(void) +{ +} + +#ifdef CONFIG_PM +static int +global_eventsource_suspend(struct sys_device *dev, pm_message_t state) +{ + /* Do generic stuff here */ + if (global_eventsource.event->suspend) + global_eventsource.event->suspend(); + return 0; +} + +static int global_eventsource_resume(struct sys_device *dev) +{ + /* Do generic stuff here */ + if (global_eventsource.event->resume) + global_eventsource.event->resume(); + return 0; +} +#else +# define global_eventsource_resume NULL +# define global_eventsource_suspend NULL +#endif + +static struct sysdev_class global_clock_event_sysclass = { + .resume = global_eventsource_resume, + .suspend = global_eventsource_suspend, + set_kset_name("global_clock_event"), +}; + +static struct sys_device device_global_clock_event = { + .id = 0, + .cls = &global_clock_event_sysclass, +}; + +static int __init global_clock_event_devinit(void) +{ + int error = sysdev_class_register(&global_clock_event_sysclass); + + if (!error) + error = sysdev_register(&device_global_clock_event); + + return error; +} + +device_initcall(global_clock_event_devinit); + +/* + * Functions related to initialization + */ +static void __devinit init_clockevents_cpu(int cpu) +{ +} + +static int __devinit clockevents_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch(action) { + case CPU_UP_PREPARE: + init_clockevents_cpu(cpu); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + break; +#endif + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata clockevents_nb = { + .notifier_call = clockevents_cpu_notify, +}; + +void __init init_clockevents(void) +{ + clockevents_cpu_notify(&clockevents_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&clockevents_nb); +} Index: linux-2.6.14/arch/i386/kernel/apic.c =================================================================== --- linux-2.6.14.orig/arch/i386/kernel/apic.c +++ linux-2.6.14/arch/i386/kernel/apic.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,23 @@ int enable_local_apic __initdata = 0; /* */ int apic_verbosity; +static unsigned int calibration_result; + +static void lapic_next_event(unsigned long evt); +static void lapic_timer_setup(int mode); + +static struct clock_event lapic_clockevent = { + .name = "lapic", + .capabilities = CLOCK_CAP_NEXTEVT | 
CLOCK_CAP_PROFILE | + CLOCK_HAS_IRQHANDLER +#ifdef CONFIG_SMP + | CLOCK_CAP_UPDATE +#endif + , + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, +}; static void apic_pm_activate(void); @@ -92,10 +110,6 @@ void __init apic_intr_init(void) /* Using APIC to generate smp_local_timer_interrupt? */ int using_apic_timer = 0; -static DEFINE_PER_CPU(int, prof_multiplier) = 1; -static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; -static DEFINE_PER_CPU(int, prof_counter) = 1; - static int enabled_via_apicbase; void enable_NMI_through_LVT0 (void * dummy) @@ -869,6 +883,11 @@ fake_ioapic_page: */ /* + * FIXME: Move this to i8253.h. There is no need to keep the access to + * the PIT scattered all around the place -tglx + */ + +/* * The timer chip is already set up at HZ interrupts per second here, * but we do not accept timer interrupts yet. We only allow the BP * to calibrate. @@ -926,12 +945,16 @@ void (*wait_timer_tick)(void) __devinitd #define APIC_DIVISOR 16 -static void __setup_APIC_LVTT(unsigned int clocks) +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot) { unsigned int lvtt_value, tmp_value, ver; ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (!APIC_INTEGRATED(ver)) lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); apic_write_around(APIC_LVTT, lvtt_value); @@ -944,31 +967,37 @@ static void __setup_APIC_LVTT(unsigned i & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | APIC_TDR_DIV_16); - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); + if (!oneshot) + apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); } -static void __devinit setup_APIC_timer(unsigned int clocks) +static void lapic_next_event(unsigned long evt) +{ + apic_write_around(APIC_TMICT, evt); +} + +static void lapic_timer_setup(int mode) { unsigned long flags; local_irq_save(flags); - - /* - * Wait for IRQ0's slice: - */ - wait_timer_tick(); - - __setup_APIC_LVTT(clocks); - + __setup_APIC_LVTT(calibration_result, mode == CLOCK_EVT_ONESHOT); local_irq_restore(flags); } +static void __devinit setup_APIC_timer(void) +{ + setup_local_clockevent(&lapic_clockevent, CPU_MASK_NONE); +} + /* * In this function we calibrate APIC bus clocks to the external * timer. Unfortunately we cannot use jiffies and the timer irq * to calibrate, since some later bootup code depends on getting * the first irq? Ugh. * + * TODO: Fix this rather than saying "Ugh" -tglx + * * We want to do the calibration only once since we * want to have local timer irqs syncron. CPUs connected * by the same APIC bus have the very same bus frequency. @@ -991,7 +1020,7 @@ static int __init calibrate_APIC_clock(v * value into the APIC clock, we just want to get the * counter running for calibration. */ - __setup_APIC_LVTT(1000000000); + __setup_APIC_LVTT(1000000000, 0); /* * The timer chip counts down to zero. Let's wait @@ -1028,6 +1057,13 @@ static int __init calibrate_APIC_clock(v result = (tt1-tt2)*APIC_DIVISOR/LOOPS; + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc32(tt1-tt2, TICK_NSEC * LOOPS); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + if (cpu_has_tsc) apic_printk(APIC_VERBOSE, "..... 
CPU clock speed is " "%ld.%04ld MHz.\n", @@ -1042,8 +1078,6 @@ static int __init calibrate_APIC_clock(v return result; } -static unsigned int calibration_result; - void __init setup_boot_APIC_clock(void) { apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"); @@ -1055,14 +1089,14 @@ void __init setup_boot_APIC_clock(void) /* * Now set up the timer for real. */ - setup_APIC_timer(calibration_result); + setup_APIC_timer(); local_irq_enable(); } void __devinit setup_secondary_APIC_clock(void) { - setup_APIC_timer(calibration_result); + setup_APIC_timer(); } void __devinit disable_APIC_timer(void) @@ -1085,6 +1119,8 @@ void enable_APIC_timer(void) } } +static DEFINE_PER_CPU(int, prof_multiplier) = 1; + /* * the frequency of the profiling timer can be changed * by writing a multiplier value into /proc/profile. @@ -1112,60 +1148,6 @@ int setup_profiling_timer(unsigned int m return 0; } - -#undef APIC_DIVISOR - -/* - * Local timer interrupt handler. It does both profiling and - * process statistics/rescheduling. - * - * We do profiling in every local tick, statistics/rescheduling - * happen only every 'profiling multiplier' ticks. The default - * multiplier is 1 and it can be changed by writing the new multiplier - * value into /proc/profile. - */ - -inline void smp_local_timer_interrupt(struct pt_regs * regs) -{ - int cpu = smp_processor_id(); - - profile_tick(CPU_PROFILING, regs); - if (--per_cpu(prof_counter, cpu) <= 0) { - /* - * The multiplier may have changed since the last time we got - * to this point as a result of the user writing to - * /proc/profile. In this case we need to adjust the APIC - * timer accordingly. - * - * Interrupts are already masked off at this point. - */ - per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu); - if (per_cpu(prof_counter, cpu) != - per_cpu(prof_old_multiplier, cpu)) { - __setup_APIC_LVTT( - calibration_result/ - per_cpu(prof_counter, cpu)); - per_cpu(prof_old_multiplier, cpu) = - per_cpu(prof_counter, cpu); - } - -#ifdef CONFIG_SMP - update_process_times(user_mode_vm(regs)); -#endif - } - - /* - * We take the 'long' return path, and there every subsystem - * grabs the apropriate locks (kernel lock/ irq lock). - * - * we might want to decouple profiling from the 'long path', - * and do the profiling totally in assembly. - * - * Currently this isn't too much of an issue (performance wise), - * we can take more than 100K local irqs per second on a 100 MHz P5. - */ -} - /* * Local APIC timer interrupt. This is the most natural way for doing * local interrupts, but local timer interrupts can be emulated by @@ -1195,7 +1177,8 @@ fastcall void smp_apic_timer_interrupt(s * interrupt lock, which is the WrongThing (tm) to do. */ irq_enter(); - smp_local_timer_interrupt(regs); + if (lapic_clockevent.event_handler) + lapic_clockevent.event_handler(regs); irq_exit(); } Index: linux-2.6.14/arch/i386/kernel/i8259.c =================================================================== --- linux-2.6.14.orig/arch/i386/kernel/i8259.c +++ linux-2.6.14/arch/i386/kernel/i8259.c @@ -422,12 +422,6 @@ void __init init_IRQ(void) intr_init_hook(); /* - * Set the clock to HZ Hz, we already have a valid - * vector now: - */ - setup_pit_timer(); - - /* * External FPU? Set up irq13 if so, for * original braindamaged IBM FERR coupling. 
*/ Index: linux-2.6.14/arch/i386/kernel/io_apic.c =================================================================== --- linux-2.6.14.orig/arch/i386/kernel/io_apic.c +++ linux-2.6.14/arch/i386/kernel/io_apic.c @@ -87,6 +87,25 @@ int vector_irq[NR_VECTORS] __read_mostly #define vector_to_irq(vector) (vector) #endif +static int timer_ack; + +void io_apic_timer_ack(void *priv) +{ + if (timer_ack) { + /* + * Subtle, when I/O APICs are used we have to ack timer IRQ + * manually to reset the IRR bit for do_slow_gettimeoffset(). + * This will also deassert NMI lines for the watchdog if run + * on an 82489DX-based system. + */ + spin_lock(&i8259A_lock); + outb(0x0c, PIC_MASTER_OCW3); + /* Ack the IRQ; AEOI will end it automatically. */ + inb(PIC_MASTER_POLL); + spin_unlock(&i8259A_lock); + } +} + /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super Index: linux-2.6.14/arch/i386/kernel/mca.c =================================================================== --- linux-2.6.14.orig/arch/i386/kernel/mca.c +++ linux-2.6.14/arch/i386/kernel/mca.c @@ -472,3 +472,22 @@ void mca_handle_nmi(void) mca_nmi_hook(); } /* mca_handle_nmi */ + +void mca_timer_ack(void *priv) +{ + int irq; + + if (MCA_bus) { + /* The PS/2 uses level-triggered interrupts. You can't + turn them off, nor would you want to (any attempt to + enable edge-triggered interrupts usually gets intercepted by a + special hardware circuit). Hence we have to acknowledge + the timer interrupt. Through some incredibly stupid + design idea, the reset for IRQ 0 is done by setting the + high bit of the PPI port B (0x61). Note that some PS/2s, + notably the 55SX, work fine if this is removed. */ + + irq = inb_p( 0x61 ); /* read the current state */ + outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ + } +} Index: linux-2.6.14/arch/i386/mach-default/setup.c =================================================================== --- linux-2.6.14.orig/arch/i386/mach-default/setup.c +++ linux-2.6.14/arch/i386/mach-default/setup.c @@ -78,8 +78,6 @@ void __init trap_init_hook(void) { } -static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL}; - /** * time_init_hook - do any specific initialisations for the system timer. 
 *
@@ -89,7 +87,6 @@ static struct irqaction irq0 = { timer_
 **/
 void __init time_init_hook(void)
 {
-	setup_irq(0, &irq0);
 }
 
 #ifdef CONFIG_MCA
Index: linux-2.6.14/include/asm-i386/arch_hooks.h
===================================================================
--- linux-2.6.14.orig/include/asm-i386/arch_hooks.h
+++ linux-2.6.14/include/asm-i386/arch_hooks.h
@@ -14,7 +14,6 @@
 extern void init_ISA_irqs(void);
 extern void apic_intr_init(void);
 extern void smp_intr_init(void);
-extern irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs);
 
 /* these are the defined hooks */
 extern void intr_init_hook(void);
Index: linux-2.6.14/include/asm-i386/io_apic.h
===================================================================
--- linux-2.6.14.orig/include/asm-i386/io_apic.h
+++ linux-2.6.14/include/asm-i386/io_apic.h
@@ -204,8 +204,11 @@ extern int io_apic_set_pci_routing (int
 
 extern int (*ioapic_renumber_irq)(int ioapic, int irq);
 
+extern void io_apic_timer_ack(void *);
+
 #else /* !CONFIG_X86_IO_APIC */
 #define io_apic_assign_pci_irqs 0
+#define io_apic_timer_ack NULL
 #endif
 
 extern int assign_irq_vector(int irq);
Index: linux-2.6.14/include/asm-i386/mach-default/do_timer.h
===================================================================
--- linux-2.6.14.orig/include/asm-i386/mach-default/do_timer.h
+++ linux-2.6.14/include/asm-i386/mach-default/do_timer.h
@@ -1,86 +1,2 @@
 /* defines for inline arch setup functions */
-#include <asm/apic.h>
-#include <asm/i8259.h>
-
-/**
- * do_timer_interrupt_hook - hook into timer tick
- * @regs: standard registers from interrupt
- *
- * Description:
- * This hook is called immediately after the timer interrupt is ack'd.
- * It's primary purpose is to allow architectures that don't possess
- * individual per CPU clocks (like the CPU APICs supply) to broadcast the
- * timer interrupt as a means of triggering reschedules etc.
- **/
-
-static inline void do_timer_interrupt_hook(struct pt_regs *regs)
-{
-	do_timer(regs);
-#ifndef CONFIG_SMP
-	update_process_times(user_mode(regs));
-#endif
-/*
- * In the SMP case we use the local APIC timer interrupt to do the
- * profiling, except when we simulate SMP mode on a uniprocessor
- * system, in that case we have to call the local interrupt handler.
- */
-#ifndef CONFIG_X86_LOCAL_APIC
-	profile_tick(CPU_PROFILING, regs);
-#else
-	if (!using_apic_timer)
-		smp_local_timer_interrupt(regs);
-#endif
-}
-
-
-/* you can safely undefine this if you don't have the Neptune chipset */
-
-#define BUGGY_NEPTUN_TIMER
-
-/**
- * do_timer_overflow - process a detected timer overflow condition
- * @count: hardware timer interrupt count on overflow
- *
- * Description:
- * This call is invoked when the jiffies count has not incremented but
- * the hardware timer interrupt has. It means that a timer tick interrupt
- * came along while the previous one was pending, thus a tick was missed
- **/
-static inline int do_timer_overflow(int count)
-{
-	int i;
-
-	spin_lock(&i8259A_lock);
-	/*
-	 * This is tricky when I/O APICs are used;
-	 * see do_timer_interrupt().
-	 */
-	i = inb(0x20);
-	spin_unlock(&i8259A_lock);
-
-	/* assumption about timer being IRQ0 */
-	if (i & 0x01) {
-		/*
-		 * We cannot detect lost timer interrupts ...
-		 * well, that's why we call them lost, don't we? :)
-		 * [hmm, on the Pentium and Alpha we can ... sort of]
-		 */
-		count -= LATCH;
-	} else {
-#ifdef BUGGY_NEPTUN_TIMER
-		/*
-		 * for the Neptun bug we know that the 'latch'
-		 * command doesn't latch the high and low value
-		 * of the counter atomically. Thus we have to
-		 * substract 256 from the counter
-		 * ... funny, isnt it? :)
-		 */
-
-		count -= 256;
-#else
-		printk("do_slow_gettimeoffset(): hardware timer problem?\n");
-#endif
-	}
-	return count;
-}
 
Index: linux-2.6.14/include/linux/mca.h
===================================================================
--- linux-2.6.14.orig/include/linux/mca.h
+++ linux-2.6.14/include/linux/mca.h
@@ -12,8 +12,10 @@
 #include <asm/mca.h>
 
 extern int MCA_bus;
+extern void mca_timer_ack(void *);
 #else
 #define MCA_bus 0
+#define mca_timer_ack NULL
 #endif
 
 /* This sets up an information callback for /proc/mca/slot?. The
Index: linux-2.6.14/arch/x86_64/kernel/i8259.c
===================================================================
--- linux-2.6.14.orig/arch/x86_64/kernel/i8259.c
+++ linux-2.6.14/arch/x86_64/kernel/i8259.c
@@ -515,7 +515,7 @@ void i8254_timer_resume(void)
 }
 
 static struct sysdev_class timer_sysclass = {
-	set_kset_name("timer"),
+	set_kset_name("timer_pit"),
 	.resume = timer_resume,
 };
 
Index: linux-2.6.14/include/linux/interrupt.h
===================================================================
--- linux-2.6.14.orig/include/linux/interrupt.h
+++ linux-2.6.14/include/linux/interrupt.h
@@ -112,7 +112,10 @@ enum
 	NET_TX_SOFTIRQ,
 	NET_RX_SOFTIRQ,
 	SCSI_SOFTIRQ,
-	TASKLET_SOFTIRQ
+	TASKLET_SOFTIRQ,
+#ifdef CONFIG_HIGH_RES_TIMERS
+	KTIMER_SOFTIRQ,
+#endif
 };
 
 /* softirq mask and active fields moved to irq_cpustat_t in
Index: linux-2.6.14/Makefile
===================================================================
--- linux-2.6.14.orig/Makefile
+++ linux-2.6.14/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 14
-EXTRAVERSION =
+EXTRAVERSION =-kthrt1
 NAME=Affluent Albatross
 
 # *DOCUMENTATION*
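
The io_apic_timer_ack() and mca_timer_ack() hooks added above share one shape: a void (*)(void *) callback that degrades to a plain NULL #define when the corresponding hardware support is not configured, so generic tick code can hold a single ack pointer and call it only when set. The sketch below only illustrates that pattern; tick_ack_handler, tick_set_ack_handler and tick_do_ack are hypothetical names, not symbols introduced by this patch.

/*
 * Illustrative sketch only -- not part of the patch.  The generic tick
 * code is assumed to keep one platform-supplied ack callback and run it
 * on every timer interrupt before the tick event handler.
 */
typedef void (*timer_ack_fn)(void *priv);

static timer_ack_fn tick_ack_handler;	/* hypothetical holder */
static void *tick_ack_priv;

static void tick_set_ack_handler(timer_ack_fn fn, void *priv)
{
	tick_ack_handler = fn;
	tick_ack_priv = priv;
}

static void tick_do_ack(void)
{
	/*
	 * io_apic_timer_ack and mca_timer_ack collapse to NULL when the
	 * hardware is not configured, so a NULL check is all the caller
	 * needs.
	 */
	if (tick_ack_handler)
		tick_ack_handler(tick_ack_priv);
}

Setup code would then pick whichever routine the platform needs, e.g. tick_set_ack_handler(MCA_bus ? mca_timer_ack : io_apic_timer_ack, NULL); the actual selection logic presumably lives elsewhere in the series.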
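
The interrupt.h hunk reserves a new KTIMER_SOFTIRQ slot under CONFIG_HIGH_RES_TIMERS but does not show how it gets used. As a rough illustration of the standard 2.6.14 softirq API (open_softirq() and raise_softirq() are real kernel calls; run_ktimer_softirq and ktimer_softirq_init are hypothetical names):

#ifdef CONFIG_HIGH_RES_TIMERS
/*
 * Illustrative sketch only -- not part of the patch.  This is the usual
 * way a 2.6.14 softirq slot is hooked up.
 */
static void run_ktimer_softirq(struct softirq_action *h)
{
	/* expire pending high resolution timers for this CPU */
}

void __init ktimer_softirq_init(void)
{
	open_softirq(KTIMER_SOFTIRQ, run_ktimer_softirq, NULL);
}
#endif

The timer interrupt path would then presumably defer expiry work with raise_softirq(KTIMER_SOFTIRQ), which is why the enum slot has to exist even though no caller appears in this part of the series.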