From: Ingo Molnar Add framework to boost/unboost the priority of RT tasks. This consists of: - caching the 'normal' priority in ->normal_prio - providing a functions to set/get the priority of the task - make sched_setscheduler() aware of boosting has_rt_policy() fix: Peter Williams Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton --- include/linux/init_task.h | 2 include/linux/sched.h | 21 +++- kernel/sched.c | 189 ++++++++++++++++++++++++++++++------ 3 files changed, 181 insertions(+), 31 deletions(-) diff -puN include/linux/init_task.h~pi-futex-scheduler-support-for-pi include/linux/init_task.h --- devel/include/linux/init_task.h~pi-futex-scheduler-support-for-pi 2006-05-25 08:01:28.000000000 -0700 +++ devel-akpm/include/linux/init_task.h 2006-05-25 08:01:28.000000000 -0700 @@ -87,6 +87,7 @@ extern struct group_info init_groups; .lock_depth = -1, \ .prio = MAX_PRIO-20, \ .static_prio = MAX_PRIO-20, \ + .normal_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ @@ -123,6 +124,7 @@ extern struct group_info init_groups; .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ + .pi_lock = SPIN_LOCK_UNLOCKED, \ } diff -puN include/linux/sched.h~pi-futex-scheduler-support-for-pi include/linux/sched.h --- devel/include/linux/sched.h~pi-futex-scheduler-support-for-pi 2006-05-25 08:01:28.000000000 -0700 +++ devel-akpm/include/linux/sched.h 2006-05-25 08:01:28.000000000 -0700 @@ -484,8 +484,11 @@ struct signal_struct { #define MAX_PRIO (MAX_RT_PRIO + 40) -#define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) +#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) +#define rt_task(p) rt_prio((p)->prio) #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) +#define has_rt_policy(p) \ + unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH) /* * Some day this will be a full-fledged user tracking system.. @@ -769,7 +772,7 @@ struct task_struct { #endif #endif int load_weight; /* for niceness load balancing purposes */ - int prio, static_prio; + int prio, static_prio, normal_prio; struct list_head run_list; prio_array_t *array; @@ -898,6 +901,9 @@ struct task_struct { /* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */ spinlock_t proc_lock; + /* Protection of the PI data structures: */ + spinlock_t pi_lock; + #ifdef CONFIG_DEBUG_MUTEXES /* mutex deadlock detection */ struct mutex_waiter *blocked_on; @@ -1069,6 +1075,17 @@ static inline void idle_task_exit(void) #endif extern void sched_idle_next(void); + +#ifdef CONFIG_RT_MUTEXES +extern int rt_mutex_getprio(task_t *p); +extern void rt_mutex_setprio(task_t *p, int prio); +#else +static inline int rt_mutex_getprio(task_t *p) +{ + return p->normal_prio; +} +#endif + extern void set_user_nice(task_t *p, long nice); extern int task_prio(const task_t *p); extern int task_nice(const task_t *p); diff -puN kernel/sched.c~pi-futex-scheduler-support-for-pi kernel/sched.c --- devel/kernel/sched.c~pi-futex-scheduler-support-for-pi 2006-05-25 08:01:28.000000000 -0700 +++ devel-akpm/kernel/sched.c 2006-05-25 08:01:28.000000000 -0700 @@ -358,6 +358,25 @@ static inline void finish_lock_switch(ru #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* + * __task_rq_lock - lock the runqueue a given task resides on. + * Must be called interrupts disabled. + */ +static inline runqueue_t *__task_rq_lock(task_t *p) + __acquires(rq->lock) +{ + struct runqueue *rq; + +repeat_lock_task: + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock(&rq->lock); + goto repeat_lock_task; + } + return rq; +} + +/* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. @@ -378,6 +397,12 @@ repeat_lock_task: return rq; } +static inline void __task_rq_unlock(runqueue_t *rq) + __releases(rq->lock) +{ + spin_unlock(&rq->lock); +} + static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) __releases(rq->lock) { @@ -666,7 +691,7 @@ static inline void enqueue_task_head(str } /* - * effective_prio - return the priority that is based on the static + * __normal_prio - return the priority that is based on the static * priority but is modified by bonuses/penalties. * * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] @@ -679,13 +704,11 @@ static inline void enqueue_task_head(str * * Both properties are important to certain workloads. */ -static int effective_prio(task_t *p) + +static inline int __normal_prio(task_t *p) { int bonus, prio; - if (rt_task(p)) - return p->prio; - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; prio = p->static_prio - bonus; @@ -720,7 +743,7 @@ static int effective_prio(task_t *p) static void set_load_weight(task_t *p) { - if (rt_task(p)) { + if (has_rt_policy(p)) { #ifdef CONFIG_SMP if (p == task_rq(p)->migration_thread) /* @@ -759,6 +782,44 @@ static inline void dec_nr_running(task_t } /* + * Calculate the expected normal priority: i.e. priority + * without taking RT-inheritance into account. Might be + * boosted by interactivity modifiers. Changes upon fork, + * setprio syscalls, and whenever the interactivity + * estimator recalculates. + */ +static inline int normal_prio(task_t *p) +{ + int prio; + + if (has_rt_policy(p)) + prio = MAX_RT_PRIO-1 - p->rt_priority; + else + prio = __normal_prio(p); + return prio; +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks, or might be boosted by + * interactivity modifiers. Will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ +static int effective_prio(task_t *p) +{ + p->normal_prio = normal_prio(p); + /* + * If we are RT tasks or we were boosted to RT priority, + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ + if (!rt_prio(p->prio)) + return p->normal_prio; + return p->prio; +} + +/* * __activate_task - move a task to the runqueue. */ static void __activate_task(task_t *p, runqueue_t *rq) @@ -780,6 +841,10 @@ static inline void __activate_idle_task( inc_nr_running(p, rq); } +/* + * Recalculate p->normal_prio and p->prio after having slept, + * updating the sleep-average too: + */ static int recalc_task_prio(task_t *p, unsigned long long now) { /* Caller must always ensure 'now >= p->timestamp' */ @@ -1465,6 +1530,12 @@ void fastcall sched_fork(task_t *p, int * event cannot wake it up and insert it on the runqueue either. */ p->state = TASK_RUNNING; + + /* + * Make sure we do not leak PI boosting priority to the child: + */ + p->prio = current->normal_prio; + INIT_LIST_HEAD(&p->run_list); p->array = NULL; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) @@ -1545,6 +1616,7 @@ void fastcall wake_up_new_task(task_t *p __activate_task(p, rq); else { p->prio = current->prio; + p->normal_prio = current->normal_prio; list_add_tail(&p->run_list, ¤t->run_list); p->array = current->array; p->array->nr_active++; @@ -3674,12 +3746,65 @@ long fastcall __sched sleep_on_timeout(w EXPORT_SYMBOL(sleep_on_timeout); +#ifdef CONFIG_RT_MUTEXES + +/* + * rt_mutex_setprio - set the current priority of a task + * @p: task + * @prio: prio value (kernel-internal form) + * + * This function changes the 'effective' priority of a task. It does + * not touch ->normal_prio like __setscheduler(). + * + * Used by the rt_mutex code to implement priority inheritance logic. + */ +void rt_mutex_setprio(task_t *p, int prio) +{ + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + int oldprio; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = task_rq_lock(p, &flags); + + oldprio = p->prio; + array = p->array; + if (array) + dequeue_task(p, array); + p->prio = prio; + + if (array) { + /* + * If changing to an RT priority then queue it + * in the active array! + */ + if (rt_task(p)) + array = rq->active; + enqueue_task(p, array); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + task_rq_unlock(rq, &flags); +} + +#endif + void set_user_nice(task_t *p, long nice) { unsigned long flags; prio_array_t *array; runqueue_t *rq; - int old_prio, new_prio, delta; + int old_prio, delta; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -3694,7 +3819,7 @@ void set_user_nice(task_t *p, long nice) * it wont have any effect on scheduling until the task is * not SCHED_NORMAL/SCHED_BATCH: */ - if (rt_task(p)) { + if (has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } @@ -3704,12 +3829,11 @@ void set_user_nice(task_t *p, long nice) dec_raw_weighted_load(rq, p); } - old_prio = p->prio; - new_prio = NICE_TO_PRIO(nice); - delta = new_prio - old_prio; p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); - p->prio += delta; + old_prio = p->prio; + p->prio = effective_prio(p); + delta = p->prio - old_prio; if (array) { enqueue_task(p, array); @@ -3724,7 +3848,6 @@ void set_user_nice(task_t *p, long nice) out_unlock: task_rq_unlock(rq, &flags); } - EXPORT_SYMBOL(set_user_nice); /* @@ -3839,16 +3962,14 @@ static void __setscheduler(struct task_s BUG_ON(p->array); p->policy = policy; p->rt_priority = prio; - if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { - p->prio = MAX_RT_PRIO-1 - p->rt_priority; - } else { - p->prio = p->static_prio; - /* - * SCHED_BATCH tasks are treated as perpetual CPU hogs: - */ - if (policy == SCHED_BATCH) - p->sleep_avg = 0; - } + p->normal_prio = normal_prio(p); + /* we are holding p->pi_lock already */ + p->prio = rt_mutex_getprio(p); + /* + * SCHED_BATCH tasks are treated as perpetual CPU hogs: + */ + if (policy == SCHED_BATCH) + p->sleep_avg = 0; set_load_weight(p); } @@ -3916,14 +4037,20 @@ recheck: if (retval) return retval; /* + * make sure no PI-waiters arrive (or leave) while we are + * changing the priority of the task: + */ + spin_lock_irqsave(&p->pi_lock, flags); + /* * To be able to change p->policy safely, the apropriate * runqueue lock must be held. */ - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } array = p->array; @@ -3944,7 +4071,9 @@ recheck: } else if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); + return 0; } EXPORT_SYMBOL_GPL(sched_setscheduler); @@ -4576,7 +4705,7 @@ void __devinit init_idle(task_t *idle, i idle->timestamp = sched_clock(); idle->sleep_avg = 0; idle->array = NULL; - idle->prio = MAX_PRIO; + idle->prio = idle->normal_prio = MAX_PRIO; idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); set_task_cpu(idle, cpu); @@ -6576,7 +6705,8 @@ void normalize_rt_tasks(void) if (!rt_task(p)) continue; - rq = task_rq_lock(p, &flags); + spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_rq_lock(p); array = p->array; if (array) @@ -6587,7 +6717,8 @@ void normalize_rt_tasks(void) resched_task(rq->curr); } - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); } read_unlock_irq(&tasklist_lock); } _