From: Ingo Molnar

fix potential swap-prefetch deadlock: swapped.lock must be taken in an
irq-safe manner, because it can be taken while holding an irq-safe lock
in the following code sequence:

  [] lockdep_acquire+0x69/0x82
  [] _spin_lock+0x21/0x2f
  [] add_to_swapped_list+0x1f/0x13b
  [] remove_mapping+0x84/0xc3
  [] shrink_inactive_list+0x3fb/0x705
  [] shrink_zone+0xb9/0xd8
  [] kswapd+0x293/0x38a
  [] kthread+0xa6/0xd3

found by the locking correctness validator.  The full validator output
is:

======================================================
[ BUG: hard-safe -> hard-unsafe lock order detected! ]
------------------------------------------------------
kswapd0/1173 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
 (swapped.lock){--..}, at: [] add_to_swapped_list+0x1f/0x13b

and this task is already holding:
 (swapper_space.tree_lock){+...}, at: [] remove_mapping+0x1c/0xc3
which would create a new lock dependency:
 (swapper_space.tree_lock){+...} -> (swapped.lock){--..}

but this new dependency connects a hard-irq-safe lock:
 (swapper_space.tree_lock){+...}
... which became hard-irq-safe at:
  [] lockdep_acquire+0x69/0x82
  [] _write_lock_irqsave+0x2a/0x3a
  [] test_clear_page_writeback+0x48/0xbc
  [] rotate_reclaimable_page+0x8b/0xb4
  [] end_page_writeback+0x18/0x45
  [] end_swap_bio_write+0x28/0x36
  [] bio_endio+0x66/0x6e
  [] __end_that_request_first+0x23c/0x559
  [] end_that_request_first+0xb/0xd
  [] ide_end_request+0xa8/0xf1
  [] ide_dma_intr+0x54/0x8f
  [] ide_intr+0x147/0x1a7
  [] handle_IRQ_event+0x1f/0x4f
  [] __do_IRQ+0xa6/0x111
  [] do_IRQ+0x8c/0xad

to a hard-irq-unsafe lock:
 (swapped.lock){--..}
... which became hard-irq-unsafe at:
...  [] lockdep_acquire+0x69/0x82
  [] _spin_lock+0x21/0x2f
  [] kprefetchd+0x3eb/0x40b
  [] kthread+0xa6/0xd3
  [] kernel_thread_helper+0x5/0xb

which could potentially lead to deadlocks!

other info that might help us debug this:

1 locks held by kswapd0/1173:
 #0:  (swapper_space.tree_lock){+...}, at: [] remove_mapping+0x1c/0xc3

the hard-irq-safe lock's dependencies:
-> (swapper_space.tree_lock){+...} ops: 121 {
   initial-use at:
     [] lockdep_acquire+0x69/0x82
     [] _write_lock_irq+0x27/0x35
     [] __add_to_swap_cache+0x65/0x149
     [] move_to_swap_cache+0x14/0x62
     [] shmem_writepage+0x10c/0x1b5
     [] pageout+0x119/0x1cc
     [] shrink_inactive_list+0x327/0x705
     [] shrink_zone+0xb9/0xd8
     [] try_to_free_pages+0x142/0x221
     [] __alloc_pages+0x1d9/0x325
     [] generic_file_buffered_write+0x189/0x5cd
     [] __generic_file_aio_write_nolock+0x450/0x48d
     [] generic_file_aio_write+0x6e/0xc1
     [] ext3_file_write+0x1a/0x8b
     [] do_sync_write+0xb1/0xe6
     [] vfs_write+0xcd/0x176
     [] sys_write+0x3b/0x71
     [] syscall_call+0x7/0xb
   in-hardirq-W at:
     [] lockdep_acquire+0x69/0x82
     [] _write_lock_irqsave+0x2a/0x3a
     [] test_clear_page_writeback+0x48/0xbc
     [] rotate_reclaimable_page+0x8b/0xb4
     [] end_page_writeback+0x18/0x45
     [] end_swap_bio_write+0x28/0x36
     [] bio_endio+0x66/0x6e
     [] __end_that_request_first+0x23c/0x559
     [] end_that_request_first+0xb/0xd
     [] ide_end_request+0xa8/0xf1
     [] ide_dma_intr+0x54/0x8f
     [] ide_intr+0x147/0x1a7
     [] handle_IRQ_event+0x1f/0x4f
     [] __do_IRQ+0xa6/0x111
     [] do_IRQ+0x8c/0xad
 }
 ... key      at: [] swapper_space+0x24/0xfc

the hard-irq-unsafe lock's dependencies:
-> (swapped.lock){--..} ops: 2 {
   initial-use at:
     [] lockdep_acquire+0x69/0x82
     [] _spin_lock+0x21/0x2f
     [] kprefetchd+0x3eb/0x40b
     [] kthread+0xa6/0xd3
     [] kernel_thread_helper+0x5/0xb
   softirq-on-W at:
     [] lockdep_acquire+0x69/0x82
     [] _spin_lock+0x21/0x2f
     [] kprefetchd+0x3eb/0x40b
     [] kthread+0xa6/0xd3
     [] kernel_thread_helper+0x5/0xb
   hardirq-on-W at:
     [] lockdep_acquire+0x69/0x82
     [] _spin_lock+0x21/0x2f
     [] kprefetchd+0x3eb/0x40b
     [] kthread+0xa6/0xd3
     [] kernel_thread_helper+0x5/0xb
 }
 ... key      at: [] swapped+0x18/0x60

stack backtrace:
 [] show_trace+0xd/0xf
 [] dump_stack+0x15/0x17
 [] check_usage+0x1f4/0x201
 [] __lockdep_acquire+0x873/0xa40
 [] lockdep_acquire+0x69/0x82
 [] _spin_lock+0x21/0x2f
 [] add_to_swapped_list+0x1f/0x13b
 [] remove_mapping+0x84/0xc3
 [] shrink_inactive_list+0x3fb/0x705
 [] shrink_zone+0xb9/0xd8
 [] kswapd+0x293/0x38a
 [] kthread+0xa6/0xd3
 [] kernel_thread_helper+0x5/0xb

Signed-off-by: Ingo Molnar
Cc: Con Kolivas
Signed-off-by: Andrew Morton
---

 mm/swap_prefetch.c |   17 +++++++++--------
 1 files changed, 9 insertions(+), 8 deletions(-)

diff -puN mm/swap_prefetch.c~mm-implement-swap-prefetching-fix mm/swap_prefetch.c
--- devel/mm/swap_prefetch.c~mm-implement-swap-prefetching-fix	2006-05-09 08:42:41.000000000 -0700
+++ devel-akpm/mm/swap_prefetch.c	2006-05-09 08:42:41.000000000 -0700
@@ -65,7 +65,7 @@ inline void delay_swap_prefetch(void)
 void add_to_swapped_list(struct page *page)
 {
 	struct swapped_entry *entry;
-	unsigned long index;
+	unsigned long index, flags;
 	int wakeup;
 
 	if (!swap_prefetch)
@@ -73,7 +73,7 @@ void add_to_swapped_list(struct page *pa
 
 	wakeup = 0;
 
-	spin_lock(&swapped.lock);
+	spin_lock_irqsave(&swapped.lock, flags);
 	if (swapped.count >= swapped.maxcount) {
 		/*
 		 * We limit the number of entries to 2/3 of physical ram.
@@ -112,7 +112,7 @@ void add_to_swapped_list(struct page *pa
 	}
 
 out_locked:
-	spin_unlock(&swapped.lock);
+	spin_unlock_irqrestore(&swapped.lock, flags);
 
 	/* Do the wakeup outside the lock to shorten lock hold time. */
 	if (wakeup)
@@ -433,6 +433,7 @@ static enum trickle_return trickle_swap(
 {
 	enum trickle_return ret = TRICKLE_DELAY;
 	struct swapped_entry *entry;
+	unsigned long flags;
 
 	/*
 	 * If laptop_mode is enabled don't prefetch to avoid hard drives
@@ -451,10 +452,10 @@ static enum trickle_return trickle_swap(
 		if (!prefetch_suitable())
 			break;
 
-		spin_lock(&swapped.lock);
+		spin_lock_irqsave(&swapped.lock, flags);
 		if (list_empty(&swapped.list)) {
 			ret = TRICKLE_FAILED;
-			spin_unlock(&swapped.lock);
+			spin_unlock_irqrestore(&swapped.lock, flags);
 			break;
 		}
 
@@ -473,7 +474,7 @@ static enum trickle_return trickle_swap(
 			 * be a reason we could not swap them back in so
 			 * delay attempting further prefetching.
 			 */
-			spin_unlock(&swapped.lock);
+			spin_unlock_irqrestore(&swapped.lock, flags);
 			break;
 		}
 
@@ -484,12 +485,12 @@ static enum trickle_return trickle_swap(
 			 * not suitable for prefetching so skip it.
 			 */
 			entry = prev_swapped_entry(entry);
-			spin_unlock(&swapped.lock);
+			spin_unlock_irqrestore(&swapped.lock, flags);
 			continue;
 		}
 		swp_entry = entry->swp_entry;
 		entry = prev_swapped_entry(entry);
-		spin_unlock(&swapped.lock);
+		spin_unlock_irqrestore(&swapped.lock, flags);
 
 		if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY)
 			break;
_
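
For reference, the rule the validator enforces boils down to the sketch
below.  This is illustrative only, not code from the tree: lock_a,
lock_b and both functions are made-up stand-ins for
swapper_space.tree_lock, swapped.lock, the reclaim path and kprefetchd.

	#include <linux/spinlock.h>

	/*
	 * lock_a stands in for swapper_space.tree_lock: it is
	 * hard-irq-safe because it is also taken from hardirq context
	 * (the IDE completion path in the report above).
	 */
	static DEFINE_SPINLOCK(lock_a);
	/* lock_b stands in for swapped.lock. */
	static DEFINE_SPINLOCK(lock_b);

	/*
	 * Like remove_mapping() -> add_to_swapped_list(): creates the
	 * lock_a -> lock_b dependency the validator complained about.
	 */
	static void reclaim_like_path(void)
	{
		unsigned long flags;

		spin_lock_irq(&lock_a);
		spin_lock_irqsave(&lock_b, flags);
		/* ... */
		spin_unlock_irqrestore(&lock_b, flags);
		spin_unlock_irq(&lock_a);
	}

	/* Like kprefetchd: takes lock_b on its own. */
	static void prefetch_like_path(void)
	{
		unsigned long flags;

		/*
		 * With a plain spin_lock(&lock_b) here, this CPU can
		 * be interrupted while holding lock_b; if the handler
		 * then spins on lock_a while another CPU holds lock_a
		 * and spins on lock_b, neither side makes progress.
		 * Disabling interrupts for the critical section closes
		 * that window, which is what the patch does for every
		 * acquisition of swapped.lock.
		 */
		spin_lock_irqsave(&lock_b, flags);
		/* ... scan the list ... */
		spin_unlock_irqrestore(&lock_b, flags);
	}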