From: Ingo Molnar

Add framework to boost/unboost the priority of RT tasks.

This consists of:

 - caching the 'normal' priority in ->normal_prio

 - providing functions to set/get the priority of the task

 - making sched_setscheduler() aware of boosting

Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
Signed-off-by: Arjan van de Ven
Signed-off-by: Andrew Morton
---

 include/linux/init_task.h |    1 
 include/linux/sched.h     |   19 ++++-
 kernel/sched.c            |  136 +++++++++++++++++++++++++++++++-----
 3 files changed, 138 insertions(+), 18 deletions(-)

diff -puN include/linux/init_task.h~pi-futex-scheduler-support-for-pi include/linux/init_task.h
--- devel/include/linux/init_task.h~pi-futex-scheduler-support-for-pi	2006-05-11 15:19:28.000000000 -0700
+++ devel-akpm/include/linux/init_task.h	2006-05-11 15:19:28.000000000 -0700
@@ -87,6 +87,7 @@ extern struct group_info init_groups;
 	.lock_depth	= -1,						\
 	.prio		= MAX_PRIO-20,					\
 	.static_prio	= MAX_PRIO-20,					\
+	.normal_prio	= MAX_PRIO-20,					\
 	.policy		= SCHED_NORMAL,					\
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.mm		= NULL,						\
diff -puN include/linux/sched.h~pi-futex-scheduler-support-for-pi include/linux/sched.h
--- devel/include/linux/sched.h~pi-futex-scheduler-support-for-pi	2006-05-11 15:19:28.000000000 -0700
+++ devel-akpm/include/linux/sched.h	2006-05-11 15:19:28.000000000 -0700
@@ -484,7 +484,8 @@ struct signal_struct {
 
 #define MAX_PRIO		(MAX_RT_PRIO + 40)
 
-#define rt_task(p)		(unlikely((p)->prio < MAX_RT_PRIO))
+#define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
+#define rt_task(p)		rt_prio((p)->prio)
 #define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
 
 /*
@@ -769,7 +770,7 @@ struct task_struct {
 #endif
 #endif
 	int load_weight;	/* for niceness load balancing purposes */
-	int prio, static_prio;
+	int prio, static_prio, normal_prio;
 	struct list_head run_list;
 	prio_array_t *array;
 
@@ -898,6 +899,9 @@ struct task_struct {
 	/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */
 	spinlock_t proc_lock;
 
+	/* Protection of the PI data structures: */
+	spinlock_t pi_lock;
+
 #ifdef CONFIG_DEBUG_MUTEXES
 	/* mutex deadlock detection */
 	struct mutex_waiter *blocked_on;
@@ -1069,6 +1073,17 @@ static inline void idle_task_exit(void)
 #endif
 
 extern void sched_idle_next(void);
+
+#ifdef CONFIG_RT_MUTEXES
+extern int rt_mutex_getprio(task_t *p);
+extern void rt_mutex_setprio(task_t *p, int prio);
+#else
+static inline int rt_mutex_getprio(task_t *p)
+{
+	return p->normal_prio;
+}
+#endif
+
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(const task_t *p);
 extern int task_nice(const task_t *p);
diff -puN kernel/sched.c~pi-futex-scheduler-support-for-pi kernel/sched.c
--- devel/kernel/sched.c~pi-futex-scheduler-support-for-pi	2006-05-11 15:19:28.000000000 -0700
+++ devel-akpm/kernel/sched.c	2006-05-11 15:19:28.000000000 -0700
@@ -668,7 +668,7 @@ static inline void enqueue_task_head(str
 }
 
 /*
- * effective_prio - return the priority that is based on the static
+ * __normal_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
  *
  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -681,13 +681,11 @@ static inline void enqueue_task_head(str
  *
  * Both properties are important to certain workloads.
  */
-static int effective_prio(task_t *p)
+
+static inline int __normal_prio(task_t *p)
 {
 	int bonus, prio;
 
-	if (rt_task(p))
-		return p->prio;
-
 	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 
 	prio = p->static_prio - bonus;
@@ -803,6 +801,44 @@ static inline void dec_nr_running(task_t
 }
 
 /*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(task_t *p)
+{
+	int prio;
+
+	if (p->policy != SCHED_NORMAL && p->policy != SCHED_BATCH)
+		prio = MAX_RT_PRIO-1 - p->rt_priority;
+	else
+		prio = __normal_prio(p);
+	return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(task_t *p)
+{
+	p->normal_prio = normal_prio(p);
+	/*
+	 * If we are RT tasks or we were boosted to RT priority,
+	 * keep the priority unchanged. Otherwise, update priority
+	 * to the normal priority:
+	 */
+	if (!rt_prio(p->prio))
+		return p->normal_prio;
+	return p->prio;
+}
+
+/*
  * __activate_task - move a task to the runqueue.
  */
 static void __activate_task(task_t *p, runqueue_t *rq)
@@ -824,6 +860,10 @@ static inline void __activate_idle_task(
 	inc_nr_running(p, rq);
 }
 
+/*
+ * Recalculate p->normal_prio and p->prio after having slept,
+ * updating the sleep-average too:
+ */
 static int recalc_task_prio(task_t *p, unsigned long long now)
 {
 	/* Caller must always ensure 'now >= p->timestamp' */
@@ -1587,6 +1627,7 @@ void fastcall wake_up_new_task(task_t *p
 			__activate_task(p, rq);
 		else {
 			p->prio = current->prio;
+			p->normal_prio = current->normal_prio;
 			list_add_tail(&p->run_list, &current->run_list);
 			p->array = current->array;
 			p->array->nr_active++;
@@ -3699,6 +3740,59 @@ long fastcall __sched sleep_on_timeout(w
 
 EXPORT_SYMBOL(sleep_on_timeout);
 
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(task_t *p, int prio)
+{
+	unsigned long flags;
+	prio_array_t *array;
+	runqueue_t *rq;
+	int oldprio;
+
+	BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+	rq = task_rq_lock(p, &flags);
+
+	oldprio = p->prio;
+	array = p->array;
+	if (array)
+		dequeue_task(p, array);
+	p->prio = prio;
+
+	if (array) {
+		/*
+		 * If changing to an RT priority then queue it
+		 * in the active array!
+		 */
+		if (rt_task(p))
+			array = rq->active;
+		enqueue_task(p, array);
+		/*
+		 * Reschedule if we are currently running on this runqueue and
+		 * our priority decreased, or if we are not currently running on
+		 * this runqueue and our priority is higher than the current's
+		 */
+		if (task_running(rq, p)) {
+			if (p->prio > oldprio)
+				resched_task(rq->curr);
+		} else if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+	task_rq_unlock(rq, &flags);
+}
+
+#endif
+
 void set_user_nice(task_t *p, long nice)
 {
 	unsigned long flags;
@@ -3864,16 +3958,16 @@ static void __setscheduler(struct task_s
 	BUG_ON(p->array);
 	p->policy = policy;
 	p->rt_priority = prio;
-	if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
-		p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-	} else {
-		p->prio = p->static_prio;
-		/*
-		 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
-		 */
-		if (policy == SCHED_BATCH)
-			p->sleep_avg = 0;
-	}
+
+	p->normal_prio = normal_prio(p);
+	/* we are holding p->pi_lock already */
+	p->prio = rt_mutex_getprio(p);
+	/*
+	 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+	 */
+	if (policy == SCHED_BATCH)
+		p->sleep_avg = 0;
+
 	set_load_weight(p);
 }
 
@@ -3941,6 +4035,11 @@ recheck:
 	if (retval)
 		return retval;
 	/*
+	 * make sure no PI-waiters arrive (or leave) while we are
+	 * changing the priority of the task:
+	 */
+	spin_lock(&p->pi_lock);
+	/*
 	 * To be able to change p->policy safely, the apropriate
 	 * runqueue lock must be held.
 	 */
@@ -3949,6 +4048,7 @@ recheck:
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
 		policy = oldpolicy = -1;
 		task_rq_unlock(rq, &flags);
+		spin_unlock(&p->pi_lock);
 		goto recheck;
 	}
 	array = p->array;
@@ -3970,6 +4070,8 @@ recheck:
 			resched_task(rq->curr);
 	}
 	task_rq_unlock(rq, &flags);
+	spin_unlock(&p->pi_lock);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
@@ -4601,7 +4703,7 @@ void __devinit init_idle(task_t *idle, i
 	idle->timestamp = sched_clock();
 	idle->sleep_avg = 0;
 	idle->array = NULL;
-	idle->prio = MAX_PRIO;
+	idle->prio = idle->normal_prio = MAX_PRIO;
 	idle->state = TASK_RUNNING;
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
 	set_task_cpu(idle, cpu);
@@ -6601,6 +6703,7 @@ void normalize_rt_tasks(void)
 		if (!rt_task(p))
 			continue;
 
+		spin_lock(&p->pi_lock);
 		rq = task_rq_lock(p, &flags);
 
 		array = p->array;
@@ -6613,6 +6716,7 @@ void normalize_rt_tasks(void)
 		}
 
 		task_rq_unlock(rq, &flags);
+		spin_unlock(&p->pi_lock);
 	}
 	read_unlock_irq(&tasklist_lock);
 }
_
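
For context, a minimal, illustrative sketch (not part of this patch) of
how the rt-mutex code is expected to drive these hooks: on contention
the lock owner gets boosted to the top waiter's priority, and once the
boost is no longer needed the owner falls back to whatever
rt_mutex_getprio() reports (->normal_prio when no boosting is in
effect). The helpers demo_boost_owner()/demo_unboost_owner() below are
made-up names for illustration only; just rt_mutex_setprio(),
rt_mutex_getprio(), ->pi_lock and ->normal_prio come from this patch.

/*
 * Illustrative only. Assumes the caller serializes against other
 * priority changes via p->pi_lock, the way sched_setscheduler() does
 * around __setscheduler().
 */
static void demo_boost_owner(task_t *owner, int waiter_prio)
{
	/* Boost only if the waiter's prio is numerically lower (= higher): */
	if (waiter_prio < owner->prio)
		rt_mutex_setprio(owner, waiter_prio);
}

static void demo_unboost_owner(task_t *owner)
{
	/*
	 * rt_mutex_getprio() falls back to ->normal_prio when no boost
	 * is in effect; with CONFIG_RT_MUTEXES the real implementation
	 * is expected to account for any remaining boosting waiter.
	 */
	rt_mutex_setprio(owner, rt_mutex_getprio(owner));
}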