From: Oleg Nesterov
Subject: [PATCH] fix exit_itimers() vs posix_timer_event() AB-BA deadlock

    CPU_0                               CPU_1

    release_task:                       posix_timer_fn:
      write_lock(tasklist);               spin_lock(timer->it_lock);
      exit_itimers:                       send_sigqueue:
        spin_lock(timer->it_lock)           read_lock(tasklist);

Actually, it is a bit worse: if the posix timer starts between the
locking of tasklist and the del_timer_sync() call, this deadlock will
still happen, because of the TIMER_RETRY logic.

With this patch posix_timer_event() detects the exiting thread group
while taking tasklist_lock and aborts the sending of the signal.  To
simplify the code, the tasklist locking is moved from
send_{,group_}sigqueue() into posix_timer_event().

Yes, this is ugly, and that is why I hesitated so long before posting
this patch.  But I don't see a simple and correct fix.

The lock ordering is not the only problem.  In fact, we don't need to
take ->it_lock in itimer_delete() at all.  The real problem is that
itimer_delete() can't delete k_itimer.it.real.timer while holding
tasklist_lock, because (without this patch) that lock prevents the
completion of posix_timer_fn().

It is unsafe to unlock tasklist_lock in __exit_signal() before calling
exit_itimers(), and it is too late to call exit_itimers() after
write_unlock_irq(&tasklist_lock) in release_task(): at that point we
no longer have ->signal/->sighand.

I believe the only sane fix would be to eliminate tasklist_lock from
the signal-sending path (see the patches from Paul and Thomas), but
this is hard and (I think) needs more work/testing.
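To make the inversion concrete, here is a minimal user-space sketch
(illustration only, not part of the patch; the pthread locks and the
function names are hypothetical stand-ins for their kernel
counterparts).  Each thread blocks on the lock the other already
holds, exactly the AB-BA order in the diagram above; build with
"cc -pthread" and, with unlucky timing, a run hangs:

#include <pthread.h>
#include <stdio.h>

/* Stand-ins for tasklist_lock and timer->it_lock. */
static pthread_rwlock_t tasklist = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t it_lock = PTHREAD_MUTEX_INITIALIZER;

/* CPU_0: release_task() -> exit_itimers(): tasklist, then it_lock. */
static void *release_task_path(void *arg)
{
        pthread_rwlock_wrlock(&tasklist);
        pthread_mutex_lock(&it_lock);     /* waits for CPU_1's it_lock */
        pthread_mutex_unlock(&it_lock);
        pthread_rwlock_unlock(&tasklist);
        return arg;
}

/* CPU_1: posix_timer_fn() -> send_sigqueue(): it_lock, then tasklist. */
static void *posix_timer_fn_path(void *arg)
{
        pthread_mutex_lock(&it_lock);
        pthread_rwlock_rdlock(&tasklist); /* waits for CPU_0's tasklist */
        pthread_rwlock_unlock(&tasklist);
        pthread_mutex_unlock(&it_lock);
        return arg;
}

int main(void)
{
        pthread_t t0, t1;

        pthread_create(&t0, NULL, release_task_path, NULL);
        pthread_create(&t1, NULL, posix_timer_fn_path, NULL);
        pthread_join(t0, NULL);           /* may never return */
        pthread_join(t1, NULL);
        puts("no deadlock this run");
        return 0;
}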
Signed-off-by: Oleg Nesterov
Signed-off-by: Andrew Morton
---

 kernel/posix-timers.c |   46 ++++++++++++++++++++++++++++++++++------------
 kernel/signal.c       |   20 +++-----------------
 2 files changed, 37 insertions(+), 29 deletions(-)

diff -puN kernel/posix-timers.c~fix-exit_itimers-vs-posix_timer_event-ab-ba-deadlock kernel/posix-timers.c
--- devel/kernel/posix-timers.c~fix-exit_itimers-vs-posix_timer_event-ab-ba-deadlock	2005-09-25 15:52:32.000000000 -0700
+++ devel-akpm/kernel/posix-timers.c	2005-09-25 16:04:36.000000000 -0700
@@ -411,6 +411,10 @@ exit:
 
 int posix_timer_event(struct k_itimer *timr,int si_private)
 {
+	struct task_struct *leader, *zombie;
+	int (*send_actor)(int, struct sigqueue *, struct task_struct *);
+	int ret;
+
 	memset(&timr->sigq->info, 0, sizeof(siginfo_t));
 	timr->sigq->info.si_sys_private = si_private;
 	/*
@@ -428,22 +432,40 @@ int posix_timer_event(struct k_itimer *t
 	timr->sigq->info.si_tid = timr->it_id;
 	timr->sigq->info.si_value = timr->it_sigev_value;
 
+	/*
+	 * We are locking ->it_lock + tasklist_lock backwards
+	 * from release_task()->exit_itimers(), beware deadlock.
+	 */
+	leader = timr->it_process->group_leader;
+	while (unlikely(!read_trylock(&tasklist_lock))) {
+		if (leader->flags & PF_EXITING) {
+			smp_rmb();
+			if (thread_group_empty(leader))
+				return 0;
+		}
+		cpu_relax();
+	}
+
+	zombie = NULL;
+	send_actor = send_group_sigqueue;
 	if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
-		struct task_struct *leader;
-		int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
-					timr->it_process);
+		if (unlikely(timr->it_process->flags & PF_EXITING)) {
+			zombie = timr->it_process;
+			timr->it_process = leader;
+			timr->it_sigev_notify = SIGEV_SIGNAL;
+		} else
+			send_actor = send_sigqueue;
+	}
 
-		if (likely(ret >= 0))
-			return ret;
+	ret = send_actor(timr->it_sigev_signo, timr->sigq,
+			timr->it_process);
 
-		timr->it_sigev_notify = SIGEV_SIGNAL;
-		leader = timr->it_process->group_leader;
-		put_task_struct(timr->it_process);
-		timr->it_process = leader;
-	}
+	read_unlock(&tasklist_lock);
 
-	return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
-				   timr->it_process);
+	if (unlikely(zombie != NULL))
+		put_task_struct(zombie);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(posix_timer_event);
diff -puN kernel/signal.c~fix-exit_itimers-vs-posix_timer_event-ab-ba-deadlock kernel/signal.c
--- devel/kernel/signal.c~fix-exit_itimers-vs-posix_timer_event-ab-ba-deadlock	2005-09-25 15:52:32.000000000 -0700
+++ devel-akpm/kernel/signal.c	2005-09-25 16:04:36.000000000 -0700
@@ -1364,22 +1364,14 @@ send_sigqueue(int sig, struct sigqueue *
 	int ret = 0;
 
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
-	read_lock(&tasklist_lock);
-
-	if (unlikely(p->flags & PF_EXITING)) {
-		ret = -1;
-		goto out_err;
-	}
-
 	spin_lock_irqsave(&p->sighand->siglock, flags);
 
 	if (unlikely(!list_empty(&q->list))) {
+		BUG_ON(q->info.si_code != SI_TIMER);
 		/*
 		 * If an SI_TIMER entry is already queue just increment
 		 * the overrun count.
 		 */
-		if (q->info.si_code != SI_TIMER)
-			BUG();
 		q->info.si_overrun++;
 		goto out;
 	}
@@ -1397,9 +1389,6 @@ send_sigqueue(int sig, struct sigqueue *
 
 out:
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
-out_err:
-	read_unlock(&tasklist_lock);
-
 	return ret;
 }
 
@@ -1410,7 +1399,6 @@ send_group_sigqueue(int sig, struct sigq
 	int ret = 0;
 
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
-	read_lock(&tasklist_lock);
 	spin_lock_irqsave(&p->sighand->siglock, flags);
 	handle_stop_signal(sig, p);
 
@@ -1421,13 +1409,12 @@ send_group_sigqueue(int sig, struct sigq
 	}
 
 	if (unlikely(!list_empty(&q->list))) {
+		BUG_ON(q->info.si_code != SI_TIMER);
 		/*
 		 * If an SI_TIMER entry is already queue just increment
 		 * the overrun count.  Other uses should not try to
 		 * send the signal multiple times.
 		 */
-		if (q->info.si_code != SI_TIMER)
-			BUG();
 		q->info.si_overrun++;
 		goto out;
 	}
@@ -1444,8 +1431,7 @@ send_group_sigqueue(int sig, struct sigq
 	__group_complete_signal(sig, p);
out:
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
-	read_unlock(&tasklist_lock);
-	return(ret);
+	return ret;
 }
 
 /*
_
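Postscript, again as an illustration only: the shape of the new
posix_timer_event() reduced to user space.  The `exiting` flag is a
hypothetical stand-in for the PF_EXITING + thread_group_empty() check,
sched_yield() for cpu_relax(), and the C11 seq-cst atomics provide
(more strongly) the ordering the kernel code gets from smp_rmb():

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

static pthread_rwlock_t tasklist = PTHREAD_RWLOCK_INITIALIZER;
static atomic_bool exiting;             /* stands in for PF_EXITING */

/* Exit side: set the flag first, then take the lock for writing, so
 * the timer side either gets the read lock or observes the flag. */
static void release_side(void)
{
        atomic_store(&exiting, true);
        pthread_rwlock_wrlock(&tasklist);
        /* ... delete the timers; no handler can spin on us forever ... */
        pthread_rwlock_unlock(&tasklist);
}

/* Timer side: never blocks on tasklist.  Returns 0 if delivery was
 * aborted, 1 if delivered, mirroring the new posix_timer_event(). */
static int timer_side(void)
{
        while (pthread_rwlock_tryrdlock(&tasklist) != 0) {
                if (atomic_load(&exiting))
                        return 0;       /* group is dying: drop the signal */
                sched_yield();          /* cpu_relax() equivalent */
        }
        /* ... queue the signal under the read lock ... */
        pthread_rwlock_unlock(&tasklist);
        return 1;
}

Because the flag is stored before the write lock is taken, a trylock
that keeps failing against the exit side implies the flag is already
visible, so the spin loop cannot live-lock against a writer that is
waiting for this very handler to finish.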