GIT 9f508f8258e18e9333f18daf1f0860df48d49ed2 git+ssh://master.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched.git

commit 9f508f8258e18e9333f18daf1f0860df48d49ed2
Author: Ingo Molnar
Date:   Tue Aug 28 12:53:24 2007 +0200

    sched: clean up task_new_fair()

    cleanup: we have the 'se' and 'curr' entity-pointers already,
    no need to use p->se and current->se.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit 213c8af67f21c1dc0d50940b159d9521c95f3c89
Author: Ingo Molnar
Date:   Tue Aug 28 12:53:24 2007 +0200

    sched: small schedstat fix

    small schedstat fix: the cfs_rq->wait_runtime 'sum of all runtimes'
    statistics counters missed newly forked tasks and thus had a constant
    negative skew. Fix this.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit b77d69db9f4ba03b2ed17e383c2d73ca89f5ab14
Author: Ingo Molnar
Date:   Tue Aug 28 12:53:24 2007 +0200

    sched: fix wait_start_fair condition in update_stats_wait_end()

    Peter Zijlstra noticed the following bug in SCHED_FEAT_SKIP_INITIAL
    (which is disabled by default at the moment): it relies on
    se.wait_start_fair being 0 while update_stats_wait_end() did not
    recognize a 0 value, so instead of 'skipping' the initial interval
    we gave the new child a maximum boost of +runtime-limit ...

    (No impact on the default kernel, but nice to fix for completeness.)

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit 7109c4429af3640f79a638f177fc5d05b9807149
Author: Ting Yang
Date:   Tue Aug 28 12:53:24 2007 +0200

    sched: call update_curr() in task_tick_fair()

    update the fair-clock before using it for the key value.

    [ mingo@elte.hu: small cleanups. ]

    Signed-off-by: Ting Yang
    Signed-off-by: Ingo Molnar
    Signed-off-by: Mike Galbraith
    Signed-off-by: Peter Zijlstra

commit f6cf891c4d7128f9f91243fc0b9ce99e10fa1586
Author: Ingo Molnar
Date:   Tue Aug 28 12:53:24 2007 +0200

    sched: make the scheduler converge to the ideal latency

    de-HZ-ification of the granularity defaults unearthed a pre-existing
    property of CFS: while it correctly converges to the granularity goal,
    it does not prevent run-time fluctuations in the range of
    [-gran ... 0 ... +gran].

    With the increase of the granularity due to the removal of HZ
    dependencies, this becomes visible in chew-max output (with 5 tasks
    running):

     out: 28 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 37 . 40
     out: 27 . 27. 32 | flu: 0 . 0 | ran: 17 . 13 | per: 44 . 40
     out: 27 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 36 . 40
     out: 29 . 27. 32 | flu: 2 . 0 | ran: 17 . 13 | per: 46 . 40
     out: 28 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 37 . 40
     out: 29 . 27. 32 | flu: 0 . 0 | ran: 18 . 13 | per: 47 . 40
     out: 28 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 37 . 40

    average slice is the ideal 13 msecs and the period is picture-perfect
    40 msecs. But the 'ran' field fluctuates around 13.33 msecs and there's
    no mechanism in CFS to keep that from happening: it's a perfectly valid
    solution that CFS finds.

    to fix this we add a granularity/preemption rule that knows about
    the "target latency", which makes tasks that run longer than the ideal
    latency run a bit less. The simplest approach is to simply decrease the
    preemption granularity when a task overruns its ideal latency. For this
    we have to track how much the task executed since its last preemption.

    ( this adds a new field to task_struct, but we can eliminate that
      overhead in 2.6.24 by putting all the scheduler timestamps into an
      anonymous union. )
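A minimal user-space sketch of that rule, for illustration only: the helper
names ideal_runtime() and tick_granularity() and the 20ms/2ms figures are
invented here, the actual kernel change is the entity_tick() hunk in the
combined diff at the end of this mail.

	#include <stdio.h>

	struct entity {
		unsigned long long sum_exec_runtime;      /* total ns executed */
		unsigned long long prev_sum_exec_runtime; /* snapshot at last preemption */
	};

	/* ideal slice: target latency split over runnable tasks, floored at min_gran */
	static unsigned long ideal_runtime(unsigned long latency,
					   unsigned long min_gran,
					   unsigned long nr_running)
	{
		unsigned long slice = latency / nr_running;

		return slice > min_gran ? slice : min_gran;
	}

	/* granularity to use for the preemption check on this tick */
	static unsigned long tick_granularity(struct entity *curr,
					      unsigned long gran,
					      unsigned long latency,
					      unsigned long min_gran,
					      unsigned long nr_running)
	{
		unsigned long long ran = curr->sum_exec_runtime -
					 curr->prev_sum_exec_runtime;

		if (ran > ideal_runtime(latency, min_gran, nr_running))
			return 0;	/* overran the ideal slice: preempt ASAP */
		return gran;
	}

	int main(void)
	{
		/* 20ms target latency, 2ms min granularity, 5 tasks, 5ms ran */
		struct entity curr = { 9000000ULL, 4000000ULL };

		printf("granularity: %lu ns\n",
		       tick_granularity(&curr, 2000000UL, 20000000UL,
					2000000UL, 5));
		return 0;
	}

With these (made-up) numbers the task has already overrun its 4 msec share
of the 20 msec target, so the granularity collapses to 0 and the next tick
reschedules it.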
    with this change in place, chew-max output is fluctuation-less all
    around:

     out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40
     out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40
     out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40
     out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40
     out: 28 . 27. 39 | flu: 0 . 1 | ran: 13 . 13 | per: 41 . 40
     out: 28 . 27. 39 | flu: 0 . 1 | ran: 13 . 13 | per: 41 . 40

    this patch has no impact on any fastpath or on any globally observable
    scheduling property. (unless you have sharp enough eyes to see
    millisecond-level ruckles in glxgears smoothness :-)

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit 5f01d519e60a6ca1a7d9be9f2d73c5f521383992
Author: Mike Galbraith
Date:   Tue Aug 28 12:53:24 2007 +0200

    sched: fix sleeper bonus limit

    There is an Amarok song-switch time increase (regression) under hefty
    load. What is happening is that sleeper_bonus is never consumed, and
    only rarely goes below runtime_limit, so for the most part, Amarok
    isn't getting any bonus at all. We're keeping sleeper_bonus right at
    runtime_limit (sched_latency == sched_runtime_limit == 40ms) forever,
    i.e. we don't consume it if we're below that, and don't add to it if
    we're above it.

    One Amarok thread waking (or anybody else) will push us past the
    threshold, so the next thread waking gets nada, but will reap pain
    from the previous thread waking until we drop back to runtime_limit.
    It looks to me like under load, some random task gets a bonus, and
    everybody else pays, whether deserving or not.

    This diff fixed the regression for me at any load rate.

    Signed-off-by: Mike Galbraith
    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
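A toy user-space model of why the threshold change in the __update_curr()
hunk below matters; drain_bonus() and the millisecond figures are invented
for illustration, and the real code additionally caps the drained amount by
delta_mine and the runtime_limit/wait_runtime headroom.

	#include <stdio.h>

	#define MSEC_TO_NS(ms)	((unsigned long)(ms) * 1000000UL)

	/* pay out part of the bonus, but only once it exceeds 'threshold' */
	static unsigned long drain_bonus(unsigned long bonus, unsigned long delta,
					 unsigned long threshold)
	{
		if (bonus > threshold)
			bonus -= delta < bonus ? delta : bonus;
		return bonus;
	}

	int main(void)
	{
		unsigned long bonus = MSEC_TO_NS(40);	/* parked at runtime_limit */
		unsigned long delta = MSEC_TO_NS(5);	/* amount we could consume now */

		/* old threshold == sched_latency (40ms): nothing is consumed */
		printf("old rule: bonus stays at %lu ms\n",
		       drain_bonus(bonus, delta, MSEC_TO_NS(40)) / 1000000UL);

		/* new threshold == min granularity (2ms): the bonus circulates */
		printf("new rule: bonus drops to %lu ms\n",
		       drain_bonus(bonus, delta, MSEC_TO_NS(2)) / 1000000UL);
		return 0;
	}

Once the bonus can be drained above the much smaller minimum granularity it
actually gets paid out instead of sitting at the limit, which is the
behaviour the fix restores.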
 include/linux/sched.h |    1 +
 kernel/sched.c        |    1 +
 kernel/sched_fair.c   |   46 +++++++++++++++++++++++++++++++++++-----------
 3 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index bd6a032..f4e324e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -904,6 +904,7 @@ struct sched_entity {
 
 	u64 exec_start;
 	u64 sum_exec_runtime;
+	u64 prev_sum_exec_runtime;
 	u64 wait_start_fair;
 	u64 sleep_start_fair;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 9fe473a..b533d6d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1587,6 +1587,7 @@ static void __sched_fork(struct task_str
 	p->se.wait_start_fair = 0;
 	p->se.exec_start = 0;
 	p->se.sum_exec_runtime = 0;
+	p->se.prev_sum_exec_runtime = 0;
 	p->se.delta_exec = 0;
 	p->se.delta_fair_run = 0;
 	p->se.delta_fair_sleep = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ee37718..ce39282 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -354,7 +354,7 @@ __update_curr(struct cfs_rq *cfs_rq, str
 	delta_fair = calc_delta_fair(delta_exec, lw);
 	delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
 
-	if (cfs_rq->sleeper_bonus > sysctl_sched_latency) {
+	if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) {
 		delta = min((u64)delta_mine, cfs_rq->sleeper_bonus);
 		delta = min(delta, (unsigned long)(
 			(long)sysctl_sched_runtime_limit - curr->wait_runtime));
@@ -489,6 +489,9 @@ update_stats_wait_end(struct cfs_rq *cfs
 {
 	unsigned long delta_fair;
 
+	if (unlikely(!se->wait_start_fair))
+		return;
+
 	delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
 			(u64)(cfs_rq->fair_clock - se->wait_start_fair));
 
@@ -668,7 +671,7 @@ #endif
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void
+static int
 __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			  struct sched_entity *curr, unsigned long granularity)
 {
@@ -679,8 +682,11 @@ __check_preempt_curr_fair(struct cfs_rq
 	 * preempt the current task unless the best task has
 	 * a larger than sched_granularity fairness advantage:
 	 */
-	if (__delta > niced_granularity(curr, granularity))
+	if (__delta > niced_granularity(curr, granularity)) {
 		resched_task(rq_of(cfs_rq)->curr);
+		return 1;
+	}
+	return 0;
 }
 
 static inline void
@@ -725,6 +731,7 @@ static void put_prev_entity(struct cfs_r
 
 static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
+	unsigned long gran, ideal_runtime, delta_exec;
 	struct sched_entity *next;
 
 	/*
@@ -741,8 +748,22 @@ static void entity_tick(struct cfs_rq *c
 	if (next == curr)
 		return;
 
-	__check_preempt_curr_fair(cfs_rq, next, curr,
-				  sched_granularity(cfs_rq));
+	gran = sched_granularity(cfs_rq);
+	ideal_runtime = niced_granularity(curr,
+			max(sysctl_sched_latency / cfs_rq->nr_running,
+			    (unsigned long)sysctl_sched_min_granularity));
+	/*
+	 * If we executed more than what the latency constraint suggests,
+	 * reduce the rescheduling granularity. This way the total latency
+	 * of how much a task is not scheduled converges to
+	 * sysctl_sched_latency:
+	 */
+	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
+	if (delta_exec > ideal_runtime)
+		gran = 0;
+
+	if (__check_preempt_curr_fair(cfs_rq, next, curr, gran))
+		curr->prev_sum_exec_runtime = curr->sum_exec_runtime;
 }
 
 /**************************************************
@@ -1076,31 +1097,34 @@ static void task_tick_fair(struct rq *rq
 static void task_new_fair(struct rq *rq, struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
-	struct sched_entity *se = &p->se;
+	struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq);
 
 	sched_info_queued(p);
 
+	update_curr(cfs_rq);
 	update_stats_enqueue(cfs_rq, se);
 	/*
 	 * Child runs first: we let it run before the parent
 	 * until it reschedules once. We set up the key so that
 	 * it will preempt the parent:
 	 */
-	p->se.fair_key = current->se.fair_key -
-		niced_granularity(&rq->curr->se, sched_granularity(cfs_rq)) - 1;
+	se->fair_key = curr->fair_key -
+		niced_granularity(curr, sched_granularity(cfs_rq)) - 1;
 	/*
 	 * The first wait is dominated by the child-runs-first logic,
 	 * so do not credit it with that waiting time yet:
 	 */
 	if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL)
-		p->se.wait_start_fair = 0;
+		se->wait_start_fair = 0;
 
 	/*
 	 * The statistical average of wait_runtime is about
 	 * -granularity/2, so initialize the task with that:
 	 */
-	if (sysctl_sched_features & SCHED_FEAT_START_DEBIT)
-		p->se.wait_runtime = -(sched_granularity(cfs_rq) / 2);
+	if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) {
+		se->wait_runtime = -(sched_granularity(cfs_rq) / 2);
+		schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
+	}
 
 	__enqueue_entity(cfs_rq, se);
 }