GIT 479a3a557eb26b93f0f267e45a5b56b0003fb85a git+ssh://master.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel.git#for-akpm

479a3a557eb26b93f0f267e45a5b56b0003fb85a

commit 479a3a557eb26b93f0f267e45a5b56b0003fb85a
Author: Peter Zijlstra
Date:   Sun Mar 2 20:11:11 2008 +0100

    sched: work around hrtick related lockup

    Marcin Slusarz reported:

    > Since early 2.6.25 days I'm having a strange lockup on boot.  As it happens
    > rarely (in ~10% of boots), I couldn't bisect it.  There was no kernel panic
    > and SysRq didn't work, so I couldn't provide any useful information to the
    > LK community.  I hoped someone else would fix it... :)
    >
    > It's rc3, so I decided to narrow it down myself.  I enabled netconsole
    > to see whether some other information is printed before the lockup.
    > It didn't help, but I noticed that the lockup happens much more frequently (~50%)!
    > So I bisected it down to:
    >
    > 8f4d37ec073c17e2d4aa8851df5837d798606d6f is first bad commit
    > commit 8f4d37ec073c17e2d4aa8851df5837d798606d6f
    > Author: Peter Zijlstra
    > Date:   Fri Jan 25 21:08:29 2008 +0100
    >
    >     sched: high-res preemption tick
    >
    >     Use HR-timers (when available) to deliver an accurate preemption tick.
    >
    >     The regular scheduler tick that runs at 1/HZ can be too coarse when nice
    >     levels are used.  The fairness system will still keep the cpu utilisation
    >     'fair' by then delaying the task that got an excessive amount of CPU time,
    >     but tries to minimize this by delivering preemption points spot-on.
    >
    >     The average frequency of this extra interrupt is sched_latency / nr_latency.
    >     This need not be higher than 1/HZ; it's just that the distribution within
    >     the sched_latency period is important.
    >
    >     Signed-off-by: Peter Zijlstra
    >     Signed-off-by: Ingo Molnar

    Work around this problem by disabling the hrtick by default.

    Reported-by: Marcin Slusarz
    Bisected-by: Marcin Slusarz
    Signed-off-by: Ingo Molnar

commit 6f3695865d005ddc319538f372c47245dac945d7
Author: Guillaume Chazarain
Date:   Tue Apr 1 21:27:26 2008 +0200

    sched: fix rq->clock overflows detection with CONFIG_NO_HZ

    When using CONFIG_NO_HZ, rq->tick_timestamp is not updated every TICK_NSEC.
    We check that the number of skipped ticks matches the clock jump seen in
    __update_rq_clock().

    Signed-off-by: Guillaume Chazarain
    Signed-off-by: Ingo Molnar

commit f50c58a05fe7a553dd5cdce7e1cc58b75979a889
Author: Peter Zijlstra
Date:   Wed Feb 27 23:21:04 2008 +0100

    sched: remove isolcpus

    CPU isolation doesn't offer anything over cpusets, hence remove it.

    Signed-off-by: Peter Zijlstra
    Signed-off-by: Ingo Molnar

commit 3de507b9613d2002e9a1023bb7d5c255d794125b
Author: Steven Rostedt
Date:   Tue Apr 1 21:27:26 2008 +0200

    ftrace: make the task state char-string visible to all

    The tracer wants to be able to convert the state number into a user-visible
    character.  This patch pulls that conversion string out of the scheduler and
    into the header, so that if it were ever to change, other parts of the kernel
    would know.

    Signed-off-by: Steven Rostedt
    Signed-off-by: Ingo Molnar

commit f1709917969b3c3eb72db0cd9d3a424cdd20e4b2
Author: Ingo Molnar
Date:   Tue Apr 1 21:27:25 2008 +0200

    sched: add latency tracer callbacks to the scheduler

    Add 3 lightweight callbacks to the tracer backend.

    Zero impact if tracing is turned off.

    Signed-off-by: Ingo Molnar

commit 5eecf0ea9a44f35edc41521a9bf33e5b152e1ac4
Author: Dmitry Adamushko
Date:   Sun Feb 17 22:34:07 2008 +0100

    latencytop: optimize LT_BACKTRACEDEPTH loops a bit

    There is no need to loop any longer when 'same == 0'.

    Signed-off-by: Dmitry Adamushko
    Signed-off-by: Ingo Molnar
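For illustration, here is a minimal, self-contained sketch of the reworked
comparison loop from the latencytop change above.  This is not kernel code:
backtrace_same() and DEPTH are invented names for this example, and the
in-tree version (see the kernel/latencytop.c hunks below) tracks a 'same'
flag across LT_BACKTRACEDEPTH entries of a struct latency_record instead of
returning a value.

#include <limits.h>
#include <stdbool.h>

#define DEPTH 12

/* Illustrative sketch only -- not the kernel implementation. */
static bool backtrace_same(const unsigned long *stored, const unsigned long *lat)
{
        int q;

        for (q = 0; q < DEPTH; q++) {
                unsigned long record = lat[q];

                /* The first mismatch decides the result -- stop looping: */
                if (stored[q] != record)
                        return false;

                /* 0 and ULONG_MAX entries mean end of backtrace: */
                if (record == 0 || record == ULONG_MAX)
                        break;
        }

        return true;
}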
commit 079aee93cfbe700becef2922fc6025ba9ca0ec5a
Author: Ingo Molnar
Date:   Fri Mar 14 16:09:59 2008 +0100

    sched: remove sysctl_sched_batch_wakeup_granularity

    It's unused.

    Signed-off-by: Ingo Molnar

commit b73daf7859e7c6439897bb542ac3933e414d2036
Author: Peter Zijlstra
Date:   Tue Apr 1 21:27:24 2008 +0200

    sched: fix overflow in wakeup preemption condition

    'pse->vruntime + gran' might overflow u64 and wrap around, giving a false
    positive, or 'se->vruntime' might just have wrapped, giving a false negative.

    Not a real issue in practice: 64-bit wraparound should be exceedingly rare,
    and it can at most result in a missed wakeup preemption.

    Signed-off-by: Peter Zijlstra
    Signed-off-by: Ingo Molnar

commit 4448605457878ebf4c05d91e6d0f21a59b3e8086
Author: Ingo Molnar
Date:   Wed Mar 19 01:37:10 2008 +0100

    sched: reenable sync wakeups

    Signed-off-by: Ingo Molnar

commit f813538086c37b6a5966310711cd7e06bf1f4052
Author: Ingo Molnar
Date:   Mon Mar 17 09:36:53 2008 +0100

    sched: cache hot buddy

    Signed-off-by: Ingo Molnar

commit cb02219e7d5ca6f28dd2cbff647bc858074493cc
Author: Ingo Molnar
Date:   Wed Mar 19 01:39:19 2008 +0100

    sched: feat affine wakeups

    Signed-off-by: Ingo Molnar

commit 4db74249fc7d4900817d296cf1da8a250c1b181a
Author: Ingo Molnar
Date:   Sun Mar 16 20:03:22 2008 +0100

    x86: patches/sched-feat-sync-wakeups.patch

    Signed-off-by: Ingo Molnar

commit 21e0e13fe353f42fc13715598294db5f9a7edf2f
Author: Ingo Molnar
Date:   Thu Feb 28 21:00:21 2008 +0100

    sched: make cpu_clock() globally synchronous

    Alexey Zaytsev reported (and bisected) that the introduction of cpu_clock()
    in printk made the timestamps jump back and forth.

    Make cpu_clock() more reliable while still keeping it fast when it's called
    frequently.

    Signed-off-by: Ingo Molnar

 include/linux/sched.h |   29 ++++++++++-
 kernel/latencytop.c   |   27 ++++++----
 kernel/sched.c        |  137 +++++++++++++++++++++++++++++++++++++------------
 kernel/sched_debug.c  |    1 -
 kernel/sched_fair.c   |   12 +----
 kernel/sysctl.c       |   11 ----
 6 files changed, 150 insertions(+), 67 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6a1e7af..919ec45 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1551,7 +1551,6 @@ static inline void wake_up_idle_cpu(int cpu) { }
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_batch_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
@@ -2031,6 +2030,32 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm)
 }
 #endif

+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+extern void
+ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next);
+#else
+static inline void
+ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
+{
+}
+#endif
+
+#ifdef CONFIG_SCHED_TRACER
+extern void
+ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr);
+extern void
+ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr);
+#else
+static inline void
+ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr)
+{
+}
+static inline void
+ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr)
+{
+}
+#endif
+
 extern long sched_setaffinity(pid_t pid, cpumask_t new_mask);
 extern long sched_getaffinity(pid_t pid, cpumask_t *mask);

@@ -2106,6 +2131,8 @@ static inline void migration_init(void)
 #define TASK_SIZE_OF(tsk)       TASK_SIZE
 #endif

+#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
+
 #endif /* __KERNEL__ */

 #endif
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index b4e3c85..7c74dab 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -64,8 +64,8 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
                return;

        for (i = 0; i < MAXLR; i++) {
-               int q;
-               int same = 1;
+               int q, same = 1;
+
                /* Nothing stored: */
                if (!latency_record[i].backtrace[0]) {
                        if (firstnonnull > i)
@@ -73,12 +73,15 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
                        continue;
                }
                for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
-                       if (latency_record[i].backtrace[q] !=
-                               lat->backtrace[q])
+                       unsigned long record = lat->backtrace[q];
+
+                       if (latency_record[i].backtrace[q] != record) {
                                same = 0;
-                       if (same && lat->backtrace[q] == 0)
                                break;
-                       if (same && lat->backtrace[q] == ULONG_MAX)
+                       }
+
+                       /* 0 and ULONG_MAX entries mean end of backtrace: */
+                       if (record == 0 || record == ULONG_MAX)
                                break;
                }
                if (same) {
@@ -143,14 +146,18 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
        for (i = 0; i < LT_SAVECOUNT ; i++) {
                struct latency_record *mylat;
                int same = 1;
+
                mylat = &tsk->latency_record[i];
                for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
-                       if (mylat->backtrace[q] !=
-                               lat.backtrace[q])
+                       unsigned long record = lat.backtrace[q];
+
+                       if (mylat->backtrace[q] != record) {
                                same = 0;
-                       if (same && lat.backtrace[q] == 0)
                                break;
-                       if (same && lat.backtrace[q] == ULONG_MAX)
+                       }
+
+                       /* 0 and ULONG_MAX entries mean end of backtrace: */
+                       if (record == 0 || record == ULONG_MAX)
                                break;
                }
                if (same) {
diff --git a/kernel/sched.c b/kernel/sched.c
index 8dcdec6..437d367 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -396,6 +396,7 @@ struct rq {
        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
        unsigned char idle_at_tick;
 #ifdef CONFIG_NO_HZ
+       unsigned long last_tick_seen;
        unsigned char in_nohz_recently;
 #endif
        /* capture load from *all* tasks on this cpu: */
@@ -499,6 +500,32 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }

+#ifdef CONFIG_NO_HZ
+static inline bool nohz_on(int cpu)
+{
+       return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE;
+}
+
+static inline u64 max_skipped_ticks(struct rq *rq)
+{
+       return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1;
+}
+
+static inline void update_last_tick_seen(struct rq *rq)
+{
+       rq->last_tick_seen = jiffies;
+}
+#else
+static inline u64 max_skipped_ticks(struct rq *rq)
+{
+       return 1;
+}
+
+static inline void update_last_tick_seen(struct rq *rq)
+{
+}
+#endif
+
 /*
  * Update the per-runqueue clock, as finegrained as the platform can give
  * us, but without assuming monotonicity, etc.:
@@ -523,9 +550,12 @@ static void __update_rq_clock(struct rq *rq)
                /*
                 * Catch too large forward jumps too:
                 */
-               if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) {
-                       if (clock < rq->tick_timestamp + TICK_NSEC)
-                               clock = rq->tick_timestamp + TICK_NSEC;
+               u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC;
+               u64 max_time = rq->tick_timestamp + max_jump;
+
+               if (unlikely(clock + delta > max_time)) {
+                       if (clock < max_time)
+                               clock = max_time;
                        else
                                clock++;
                        rq->clock_overflows++;
@@ -594,15 +624,21 @@ enum {
        SCHED_FEAT_NEW_FAIR_SLEEPERS    = 1,
        SCHED_FEAT_WAKEUP_PREEMPT       = 2,
        SCHED_FEAT_START_DEBIT          = 4,
-       SCHED_FEAT_HRTICK               = 8,
-       SCHED_FEAT_DOUBLE_TICK          = 16,
+       SCHED_FEAT_AFFINE_WAKEUPS       = 8,
+       SCHED_FEAT_CACHE_HOT_BUDDY      = 16,
+       SCHED_FEAT_SYNC_WAKEUPS         = 32,
+       SCHED_FEAT_HRTICK               = 64,
+       SCHED_FEAT_DOUBLE_TICK          = 128,
 };

 const_debug unsigned int sysctl_sched_features =
                SCHED_FEAT_NEW_FAIR_SLEEPERS    * 1 |
                SCHED_FEAT_WAKEUP_PREEMPT       * 1 |
                SCHED_FEAT_START_DEBIT          * 1 |
-               SCHED_FEAT_HRTICK               * 1 |
+               SCHED_FEAT_AFFINE_WAKEUPS       * 1 |
+               SCHED_FEAT_CACHE_HOT_BUDDY      * 1 |
+               SCHED_FEAT_SYNC_WAKEUPS         * 1 |
+               SCHED_FEAT_HRTICK               * 0 |
                SCHED_FEAT_DOUBLE_TICK          * 0;

 #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
@@ -632,11 +668,39 @@ int sysctl_sched_rt_runtime = 950000;
  */
 #define RUNTIME_INF     ((u64)~0ULL)

+static const unsigned long long time_sync_thresh = 100000;
+
+static DEFINE_PER_CPU(unsigned long long, time_offset);
+static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
+
 /*
- * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
- * clock constructed from sched_clock():
+ * Global lock which we take every now and then to synchronize
+ * the CPUs time.  This method is not warp-safe, but it's good
+ * enough to synchronize slowly diverging time sources and thus
+ * it's good enough for tracing:
  */
-unsigned long long cpu_clock(int cpu)
+static DEFINE_SPINLOCK(time_sync_lock);
+static unsigned long long prev_global_time;
+
+static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&time_sync_lock, flags);
+
+       if (time < prev_global_time) {
+               per_cpu(time_offset, cpu) += prev_global_time - time;
+               time = prev_global_time;
+       } else {
+               prev_global_time = time;
+       }
+
+       spin_unlock_irqrestore(&time_sync_lock, flags);
+
+       return time;
+}
+
+static unsigned long long __cpu_clock(int cpu)
 {
        unsigned long long now;
        unsigned long flags;
@@ -657,6 +721,24 @@ unsigned long long cpu_clock(int cpu)

        return now;
 }
+
+/*
+ * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
+ * clock constructed from sched_clock():
+ */
+unsigned long long cpu_clock(int cpu)
+{
+       unsigned long long prev_cpu_time, time, delta_time;
+
+       prev_cpu_time = per_cpu(prev_cpu_time, cpu);
+       time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
+       delta_time = time-prev_cpu_time;
+
+       if (unlikely(delta_time > time_sync_thresh))
+               time = __sync_cpu_clock(time, cpu);
+
+       return time;
+}
 EXPORT_SYMBOL_GPL(cpu_clock);

 #ifndef prepare_arch_switch
@@ -1438,7 +1520,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
        /*
         * Buddy candidates are cache hot:
         */
-       if (&p->se == cfs_rq_of(&p->se)->next)
+       if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
                return 1;

        if (p->sched_class != &fair_sched_class)
@@ -1839,6 +1921,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
        long old_state;
        struct rq *rq;

+       if (!sched_feat(SYNC_WAKEUPS))
+               sync = 0;
+
        smp_wmb();
        rq = task_rq_lock(p, &flags);
        old_state = p->state;
@@ -1889,6 +1974,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)

 out_activate:
 #endif /* CONFIG_SMP */
+       ftrace_wake_up_task(p, rq->curr);
        schedstat_inc(p, se.nr_wakeups);
        if (sync)
                schedstat_inc(p, se.nr_wakeups_sync);
@@ -2032,6 +2118,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                p->sched_class->task_new(rq, p);
                inc_nr_running(p, rq);
        }
+       ftrace_wake_up_new_task(p, rq->curr);
        check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
@@ -2204,6 +2291,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
        struct mm_struct *mm, *oldmm;

        prepare_task_switch(rq, prev, next);
+       ftrace_ctx_switch(prev, next);
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
@@ -3765,6 +3853,7 @@ void scheduler_tick(void)
                rq->clock_underflows++;
        }
        rq->tick_timestamp = rq->clock;
+       update_last_tick_seen(rq);
        update_cpu_load(rq);
        curr->sched_class->task_tick(rq, curr, 0);
        update_sched_rt_period(rq);
@@ -5166,7 +5255,7 @@ out_unlock:
        return retval;
 }

-static const char stat_nam[] = "RSDTtZX";
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;

 void sched_show_task(struct task_struct *p)
 {
@@ -5309,7 +5398,6 @@ static inline void sched_init_granularity(void)
                sysctl_sched_latency = limit;

        sysctl_sched_wakeup_granularity *= factor;
-       sysctl_sched_batch_wakeup_granularity *= factor;
 }

 #ifdef CONFIG_SMP
@@ -6224,24 +6312,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
        rcu_assign_pointer(rq->sd, sd);
 }

-/* cpus with isolated domains */
-static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
-
-/* Setup the mask of cpus configured for isolated domains */
-static int __init isolated_cpu_setup(char *str)
-{
-       int ints[NR_CPUS], i;
-
-       str = get_options(str, ARRAY_SIZE(ints), ints);
-       cpus_clear(cpu_isolated_map);
-       for (i = 1; i <= ints[0]; i++)
-               if (ints[i] < NR_CPUS)
-                       cpu_set(ints[i], cpu_isolated_map);
-       return 1;
-}
-
-__setup("isolcpus=", isolated_cpu_setup);
-
 /*
  * init_sched_build_groups takes the cpumask we wish to span, and a pointer
  * to a function which identifies what group(along with sched group) a CPU
@@ -6868,7 +6938,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
        doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
        if (!doms_cur)
                doms_cur = &fallback_doms;
-       cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
+       *doms_cur = *cpu_map;
        err = build_sched_domains(doms_cur);
        register_sched_domain_sysctl();

@@ -6929,7 +6999,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
        if (doms_new == NULL) {
                ndoms_new = 1;
                doms_new = &fallback_doms;
-               cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+               doms_new[0] = cpu_online_map;
        }

        /* Destroy deleted domains */
@@ -7088,7 +7158,7 @@ void __init sched_init_smp(void)

        get_online_cpus();
        arch_init_sched_domains(&cpu_online_map);
-       cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
+       non_isolated_cpus = cpu_possible_map;
        if (cpus_empty(non_isolated_cpus))
                cpu_set(smp_processor_id(), non_isolated_cpus);
        put_online_cpus();
@@ -7214,6 +7284,7 @@ void __init sched_init(void)
                lockdep_set_class(&rq->lock, &rq->rq_lock_key);
                rq->nr_running = 0;
                rq->clock = 1;
+               update_last_tick_seen(rq);
                init_cfs_rq(&rq->cfs, rq);
                init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index ef358ba..3d09106 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -214,7 +214,6 @@ static int sched_debug_show(struct seq_file *m, void *v)
        PN(sysctl_sched_latency);
        PN(sysctl_sched_min_granularity);
        PN(sysctl_sched_wakeup_granularity);
-       PN(sysctl_sched_batch_wakeup_granularity);
        PN(sysctl_sched_child_runs_first);
        P(sysctl_sched_features);
 #undef PN
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 86a9337..33101e5 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -62,16 +62,6 @@ const_debug unsigned int sysctl_sched_child_runs_first = 1;
 unsigned int __read_mostly sysctl_sched_compat_yield;

 /*
- * SCHED_BATCH wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
- *
- * This option delays the preemption effects of decoupled workloads
- * and reduces their over-scheduling. Synchronous workloads will still
- * have immediate wakeup/sleep latencies.
- */
-unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
-
-/*
  * SCHED_OTHER wake-up granularity.
  * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
@@ -1148,7 +1138,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
                if (unlikely(se->load.weight > NICE_0_LOAD))
                        gran = calc_delta_fair(gran, &se->load);

-               if (pse->vruntime + gran < se->vruntime)
+               if ((s64)(se->vruntime - pse->vruntime) > (s64)gran)
                        resched_task(curr);
        }

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2a2d68..be332e1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -270,17 +270,6 @@ static struct ctl_table kern_table[] = {
        },
        {
                .ctl_name       = CTL_UNNUMBERED,
-               .procname       = "sched_batch_wakeup_granularity_ns",
-               .data           = &sysctl_sched_batch_wakeup_granularity,
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec_minmax,
-               .strategy       = &sysctl_intvec,
-               .extra1         = &min_wakeup_granularity_ns,
-               .extra2         = &max_wakeup_granularity_ns,
-       },
-       {
-               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "sched_child_runs_first",
                .data           = &sysctl_sched_child_runs_first,
                .maxlen         = sizeof(unsigned int),