From: Ingo Molnar

this is version -V3 of the PI-futex patchset (on top of current -mm plus
the -V2 delta patch).

Changes since -V2:

- do the set_task_state() -> set_current_state() cleanups for real

- make plist_add() more readable

- whitespace cleanups in kernel/futex.c

- further cleanups in rtmutex_common.h

- fix corner-case race in the PI-locking code (it triggered in the -rt tree)

Signed-off-by: Ingo Molnar
Signed-off-by: Andrew Morton
---

 kernel/rtmutex.c        |  245 ++++++++++++++++++++++-----------------------
 kernel/rtmutex_common.h |    6 
 lib/plist.c             |    3 
 kernel/futex.c          |    0 
 4 files changed, 134 insertions(+), 120 deletions(-)

diff -puN kernel/futex.c~pi-futex-v3 kernel/futex.c
diff -puN kernel/rtmutex.c~pi-futex-v3 kernel/rtmutex.c
--- devel/kernel/rtmutex.c~pi-futex-v3	2006-03-26 15:35:35.000000000 -0800
+++ devel-akpm/kernel/rtmutex.c	2006-03-26 15:35:35.000000000 -0800
@@ -173,11 +173,12 @@ static int lock_pi_chain(struct rt_mutex
 	struct task_struct *owner;
 	struct rt_mutex *nextlock, *lock = act_lock;
 	struct rt_mutex_waiter *nextwaiter;
+	int deadlock_detect;
 
 	/*
 	 * Debugging might turn deadlock detection on, unconditionally:
 	 */
-	detect_deadlock = debug_rt_mutex_detect_deadlock(detect_deadlock);
+	deadlock_detect = debug_rt_mutex_detect_deadlock(detect_deadlock);
 
 	for (;;) {
 		owner = rt_mutex_owner(lock);
@@ -185,7 +186,7 @@ static int lock_pi_chain(struct rt_mutex
 		/* Check for circular dependencies */
 		if (unlikely(owner->pi_locked_by == current)) {
 			debug_rt_mutex_deadlock(detect_deadlock, waiter, lock);
-			return detect_deadlock ? -EDEADLK : 0;
+			return detect_deadlock ? -EDEADLK : 1;
 		}
 
 		while (!spin_trylock(&owner->pi_lock)) {
@@ -211,7 +212,7 @@ static int lock_pi_chain(struct rt_mutex
 
 		/* End of chain? */
 		if (!nextwaiter)
-			return 0;
+			return 1;
 
 		nextlock = nextwaiter->lock;
 
@@ -223,7 +224,7 @@ static int lock_pi_chain(struct rt_mutex
 			list_del_init(&owner->pi_lock_chain);
 			owner->pi_locked_by = NULL;
 			spin_unlock(&owner->pi_lock);
-			return detect_deadlock ? -EDEADLK : 0;
+			return detect_deadlock ? -EDEADLK : 1;
 		}
 
 		/* Try to get nextlock->wait_lock: */
@@ -242,7 +243,7 @@ static int lock_pi_chain(struct rt_mutex
 		 * for userspace locks), we have to walk the full chain
 		 * unconditionally.
 		 */
-		if (detect_deadlock)
+		if (deadlock_detect)
 			continue;
 
 		/*
@@ -255,11 +256,11 @@ static int lock_pi_chain(struct rt_mutex
 			/* If the top waiter has higher priority, stop: */
 			if (rt_mutex_top_waiter(lock)->list_entry.prio <=
 						waiter->list_entry.prio)
-				return 0;
+				return 1;
 		} else {
 			/* If nextwaiter is not the top waiter, stop: */
 			if (rt_mutex_top_waiter(lock) != nextwaiter)
-				return 0;
+				return 1;
 		}
 	}
 }
@@ -340,103 +341,6 @@ static void adjust_pi_chain(struct rt_mu
 }
 
 /*
- * Task blocks on lock.
- *
- * Prepare waiter and potentially propagate our priority into the pi chain.
- *
- * This must be called with lock->wait_lock held.
- */
-static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
-				   struct rt_mutex_waiter *waiter,
-				   int detect_deadlock __IP_DECL__)
-{
-	int res = 0;
-	struct rt_mutex_waiter *top_waiter = waiter;
-	LIST_HEAD(lock_chain);
-
-	waiter->task = current;
-	waiter->lock = lock;
-	debug_rt_mutex_reset_waiter(waiter);
-
-	spin_lock(&current->pi_lock);
-	current->pi_locked_by = current;
-	plist_node_init(&waiter->list_entry, current->prio);
-	plist_node_init(&waiter->pi_list_entry, current->prio);
-
-	/* Get the top priority waiter of the lock: */
-	if (rt_mutex_has_waiters(lock))
-		top_waiter = rt_mutex_top_waiter(lock);
-	plist_add(&waiter->list_entry, &lock->wait_list);
-
-	current->pi_blocked_on = waiter;
-
-	/*
-	 * Call adjust_prio_chain, when waiter is the new top waiter
-	 * or when deadlock detection is requested:
-	 */
-	if (waiter != rt_mutex_top_waiter(lock) &&
-	    !debug_rt_mutex_detect_deadlock(detect_deadlock))
-		goto out;
-
-	/* Try to lock the full chain: */
-	res = lock_pi_chain(lock, waiter, &lock_chain, 1, detect_deadlock);
-
-	if (likely(!res))
-		adjust_pi_chain(lock, waiter, top_waiter, &lock_chain);
-
-	/* Common case: we managed to lock it: */
-	if (res != -EBUSY)
-		goto out_unlock;
-
-	/* Rare case: we hit some other task running a pi chain operation: */
-	unlock_pi_chain(&lock_chain);
-
-	plist_del(&waiter->list_entry, &lock->wait_list);
-	current->pi_blocked_on = NULL;
-	current->pi_locked_by = NULL;
-	spin_unlock(&current->pi_lock);
-	fixup_rt_mutex_waiters(lock);
-
-	spin_unlock(&lock->wait_lock);
-
-	spin_lock(&pi_conflicts_lock);
-
-	spin_lock(&current->pi_lock);
-	current->pi_locked_by = current;
-	spin_lock(&lock->wait_lock);
-	if (!rt_mutex_owner(lock)) {
-		waiter->task = NULL;
-		spin_unlock(&pi_conflicts_lock);
-		goto out;
-	}
-	plist_node_init(&waiter->list_entry, current->prio);
-	plist_node_init(&waiter->pi_list_entry, current->prio);
-
-	/* Get the top priority waiter of the lock: */
-	if (rt_mutex_has_waiters(lock))
-		top_waiter = rt_mutex_top_waiter(lock);
-	plist_add(&waiter->list_entry, &lock->wait_list);
-
-	current->pi_blocked_on = waiter;
-
-	/* Lock the full chain: */
-	res = lock_pi_chain(lock, waiter, &lock_chain, 0, detect_deadlock);
-
-	/* Drop the conflicts lock before adjusting: */
-	spin_unlock(&pi_conflicts_lock);
-
-	if (likely(!res))
-		adjust_pi_chain(lock, waiter, top_waiter, &lock_chain);
-
- out_unlock:
-	unlock_pi_chain(&lock_chain);
- out:
-	current->pi_locked_by = NULL;
-	spin_unlock(&current->pi_lock);
-	return res;
-}
-
-/*
  * Optimization: check if we can steal the lock from the
  * assigned pending owner [which might not have taken the
  * lock yet]:
@@ -542,6 +446,117 @@ static int try_to_take_rt_mutex(struct r
 }
 
 /*
+ * Task blocks on lock.
+ *
+ * Prepare waiter and potentially propagate our priority into the pi chain.
+ *
+ * This must be called with lock->wait_lock held.
+ * return values: 1: waiter queued, 0: got the lock,
+ * -EDEADLK: deadlock detected.
+ */
+static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
+				   struct rt_mutex_waiter *waiter,
+				   int detect_deadlock __IP_DECL__)
+{
+	struct rt_mutex_waiter *top_waiter = waiter;
+	LIST_HEAD(lock_chain);
+	int res = 1;
+
+	waiter->task = current;
+	waiter->lock = lock;
+	debug_rt_mutex_reset_waiter(waiter);
+
+	spin_lock(&current->pi_lock);
+	current->pi_locked_by = current;
+	plist_node_init(&waiter->list_entry, current->prio);
+	plist_node_init(&waiter->pi_list_entry, current->prio);
+
+	/* Get the top priority waiter of the lock: */
+	if (rt_mutex_has_waiters(lock))
+		top_waiter = rt_mutex_top_waiter(lock);
+	plist_add(&waiter->list_entry, &lock->wait_list);
+
+	current->pi_blocked_on = waiter;
+
+	/*
+	 * Call adjust_prio_chain, when waiter is the new top waiter
+	 * or when deadlock detection is requested:
+	 */
+	if (waiter != rt_mutex_top_waiter(lock) &&
+	    !debug_rt_mutex_detect_deadlock(detect_deadlock))
+		goto out_unlock_pi;
+
+	/* Try to lock the full chain: */
+	res = lock_pi_chain(lock, waiter, &lock_chain, 1, detect_deadlock);
+
+	if (likely(res == 1))
+		adjust_pi_chain(lock, waiter, top_waiter, &lock_chain);
+
+	/* Common case: we managed to lock it: */
+	if (res != -EBUSY)
+		goto out_unlock_chain_pi;
+
+	/* Rare case: we hit some other task running a pi chain operation: */
+	unlock_pi_chain(&lock_chain);
+
+	plist_del(&waiter->list_entry, &lock->wait_list);
+	current->pi_blocked_on = NULL;
+	current->pi_locked_by = NULL;
+	spin_unlock(&current->pi_lock);
+	fixup_rt_mutex_waiters(lock);
+
+	spin_unlock(&lock->wait_lock);
+
+	/*
+	 * Here we have dropped all locks, and take the global
+	 * pi_conflicts_lock. We have to redo all the work, no
+	 * previous information about the lock is valid anymore:
+	 */
+	spin_lock(&pi_conflicts_lock);
+
+	spin_lock(&lock->wait_lock);
+	if (try_to_take_rt_mutex(lock __IP__)) {
+		/*
+		 * Rare race: against all odds we got the lock.
+		 */
+		res = 0;
+		goto out;
+	}
+
+	WARN_ON(!rt_mutex_owner(lock) || rt_mutex_owner(lock) == current);
+
+	spin_lock(&current->pi_lock);
+	current->pi_locked_by = current;
+
+	plist_node_init(&waiter->list_entry, current->prio);
+	plist_node_init(&waiter->pi_list_entry, current->prio);
+
+	/* Get the top priority waiter of the lock: */
+	if (rt_mutex_has_waiters(lock))
+		top_waiter = rt_mutex_top_waiter(lock);
+	plist_add(&waiter->list_entry, &lock->wait_list);
+
+	current->pi_blocked_on = waiter;
+
+	/* Lock the full chain: */
+	res = lock_pi_chain(lock, waiter, &lock_chain, 0, detect_deadlock);
+
+	/* Drop the conflicts lock before adjusting: */
+	spin_unlock(&pi_conflicts_lock);
+
+	if (likely(res == 1))
+		adjust_pi_chain(lock, waiter, top_waiter, &lock_chain);
+
+ out_unlock_chain_pi:
+	unlock_pi_chain(&lock_chain);
+ out_unlock_pi:
+	current->pi_locked_by = NULL;
+	spin_unlock(&current->pi_lock);
+ out:
+	return res;
+}
+
+/*
  * Wake up the next waiter on the lock.
 *
 * Remove the top waiter from the current tasks waiter list and from
@@ -641,11 +656,11 @@ static int remove_waiter(struct rt_mutex
 
 	spin_lock(&pi_conflicts_lock);
 
+	spin_lock(&lock->wait_lock);
+
 	spin_lock(&current->pi_lock);
 	current->pi_locked_by = current;
 
-	spin_lock(&lock->wait_lock);
-
 	/* We might have been woken up: */
 	if (!waiter->task) {
 		spin_unlock(&pi_conflicts_lock);
@@ -709,7 +724,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
 
 	BUG_ON(rt_mutex_owner(lock) == current);
 
-	set_task_state(current, state);
+	set_current_state(state);
 
 	/* Setup the timer, when timeout != NULL */
 	if (unlikely(timeout))
@@ -743,10 +758,10 @@ rt_mutex_slowlock(struct rt_mutex *lock,
 		if (!waiter.task) {
 			ret = task_blocks_on_rt_mutex(lock, &waiter,
 						      detect_deadlock __IP__);
-			if (ret == -EDEADLK)
+			/* got the lock or deadlock: */
+			if (ret == 0 || ret == -EDEADLK)
 				break;
-			if (ret == -EBUSY)
-				continue;
+			ret = 0;
 		}
 
 		spin_unlock_irqrestore(&lock->wait_lock, flags);
@@ -757,10 +772,10 @@ rt_mutex_slowlock(struct rt_mutex *lock,
 			schedule_rt_mutex(lock);
 
 		spin_lock_irqsave(&lock->wait_lock, flags);
-		set_task_state(current, state);
+		set_current_state(state);
 	}
 
-	set_task_state(current, TASK_RUNNING);
+	set_current_state(TASK_RUNNING);
 
 	if (unlikely(waiter.task))
 		remove_waiter(lock, &waiter __IP__);
@@ -806,11 +821,9 @@ rt_mutex_slowtrylock(struct rt_mutex *lo
 		ret = try_to_take_rt_mutex(lock __IP__);
 		/*
 		 * try_to_take_rt_mutex() sets the lock waiters
-		 * bit. We might be the only waiter. Check if this
-		 * needs to be cleaned up.
+		 * bit unconditionally. Clean this up.
 		 */
-		if (!ret)
-			fixup_rt_mutex_waiters(lock);
+		fixup_rt_mutex_waiters(lock);
 	}
 
 	spin_unlock_irqrestore(&lock->wait_lock, flags);
diff -puN kernel/rtmutex_common.h~pi-futex-v3 kernel/rtmutex_common.h
--- devel/kernel/rtmutex_common.h~pi-futex-v3	2006-03-26 15:35:35.000000000 -0800
+++ devel-akpm/kernel/rtmutex_common.h	2006-03-26 15:35:35.000000000 -0800
@@ -98,18 +98,18 @@ task_top_pi_waiter(struct task_struct *p
 static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
 {
 	return (struct task_struct *)
-		((unsigned long)((lock)->owner) & ~RT_MUTEX_OWNER_MASKALL);
+		((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
 }
 
 static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
 {
 	return (struct task_struct *)
-		((unsigned long)((lock)->owner) & ~RT_MUTEX_HAS_WAITERS);
+		((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
 }
 
 static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
 {
-	return ((unsigned long)((lock)->owner) & RT_MUTEX_OWNER_PENDING);
+	return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
 }
 
 /*
diff -puN lib/plist.c~pi-futex-v3 lib/plist.c
--- devel/lib/plist.c~pi-futex-v3	2006-03-26 15:35:35.000000000 -0800
+++ devel-akpm/lib/plist.c	2006-03-26 15:35:35.000000000 -0800
@@ -38,7 +38,7 @@ void plist_add(struct plist_node *node, 
 
 	WARN_ON(!plist_node_empty(node));
 
-	list_for_each_entry(iter, &head->prio_list, plist.prio_list)
+	list_for_each_entry(iter, &head->prio_list, plist.prio_list) {
 		if (node->prio < iter->prio)
 			goto lt_prio;
 		else if (node->prio == iter->prio) {
@@ -46,6 +46,7 @@ void plist_add(struct plist_node *node, 
 				       struct plist_node, plist.prio_list);
 			goto eq_prio;
 		}
+	}
 
 lt_prio:
 	list_add_tail(&node->plist.prio_list, &iter->plist.prio_list);
_