From: Mikulas Patocka

Make one kcopyd thread per device.

The original shared kcopyd could deadlock.

Configuration:
--------------

A (dm-raid1)
B (dm-raid1)
C (any device)

B is a part of device A. C is a part of device B. There may be other
devices in the mirrors, but they are not relevant to this deadlock.

Deadlock scenario:
------------------

Both mirror devices A and B are running recovery. B's mempool
"md->tio_pool" is empty. All the IO requests allocated from this pool
belong to the region that is being synchronized, so they are held on the
ms->writes and ms->reads queues.

A makes a kcopyd request to B during A's recovery. The stacktrace of A's
"kmirrord" thread is:

do_mirror
_do_mirror
do_recovery
recover
kcopyd_copy

kcopyd receives A's request and starts processing it:

do_work
process_jobs(&_io_jobs, run_io_job)
run_io_job
dm_io
async_io
dispatch_io
do_region
submit_bio
generic_make_request
... submitting the bio calls B's request function
q->make_request_fn
dm_request (on device B)
__split_bio
__clone_and_map
alloc_tio
- alloc_tio waits until some space is made in B's md->tio_pool

Meanwhile, device B is doing its own recovery work (sending requests to
device C). B's "kmirrord" thread has this stacktrace:

do_mirror
_do_mirror
do_recovery
recover
kcopyd_copy
--- however, kcopyd is blocked elsewhere, so it doesn't process the
request immediately

The deadlock:
-------------

All of B's requests are waiting for B's recovery of the region to
complete. B's recovery is waiting for kcopyd. kcopyd is waiting (on
behalf of A's request) until one of B's requests finishes and makes room
in B's md->tio_pool mempool.

Giving each client its own kcopyd thread breaks this cycle: A's job
blocking on B's md->tio_pool can then stall only A's thread, while B's
own kcopyd jobs keep being processed.

Signed-off-by: Mikulas Patocka
Signed-off-by: Alasdair G Kergon
---

 drivers/md/kcopyd.c |  132 ++++++++++++++++++++++++++++------------------------
 1 files changed, 73 insertions(+), 59 deletions(-)

Index: linux-2.6.25/drivers/md/kcopyd.c
===================================================================
--- linux-2.6.25.orig/drivers/md/kcopyd.c	2008-04-24 18:00:29.000000000 +0100
+++ linux-2.6.25/drivers/md/kcopyd.c	2008-04-24 18:00:31.000000000 +0100
@@ -26,14 +26,6 @@
 #include "kcopyd.h"
 #include "dm.h"
 
-static struct workqueue_struct *_kcopyd_wq;
-static struct work_struct _kcopyd_work;
-
-static void wake(void)
-{
-	queue_work(_kcopyd_wq, &_kcopyd_work);
-}
-
 /*-----------------------------------------------------------------
  * Each kcopyd client has its own little pool of preallocated
  * pages for kcopyd io.
@@ -50,8 +42,30 @@ struct dm_kcopyd_client {
 
 	wait_queue_head_t destroyq;
 	atomic_t nr_jobs;
+
+	struct workqueue_struct *kcopyd_wq;
+	struct work_struct kcopyd_work;
+
+/*
+ * We maintain three lists of jobs:
+ *
+ * i) jobs waiting for pages
+ * ii) jobs that have pages, and are waiting for the io to be issued.
+ * iii) jobs that have completed.
+ *
+ * All three of these are protected by job_lock.
+ */
+	spinlock_t job_lock;
+	struct list_head complete_jobs;
+	struct list_head io_jobs;
+	struct list_head pages_jobs;
 };
 
+static void wake(struct dm_kcopyd_client *kc)
+{
+	queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
+}
+
 static struct page_list *alloc_pl(void)
 {
 	struct page_list *pl;
@@ -209,21 +223,6 @@ struct kcopyd_job {
 static struct kmem_cache *_job_cache;
 static mempool_t *_job_pool;
 
-/*
- * We maintain three lists of jobs:
- *
- * i) jobs waiting for pages
- * ii) jobs that have pages, and are waiting for the io to be issued.
- * iii) jobs that have completed.
- *
- * All three of these are protected by job_lock.
- */
-static DEFINE_SPINLOCK(_job_lock);
-
-static LIST_HEAD(_complete_jobs);
-static LIST_HEAD(_io_jobs);
-static LIST_HEAD(_pages_jobs);
-
 static int jobs_init(void)
 {
 	_job_cache = KMEM_CACHE(kcopyd_job, 0);
@@ -241,10 +240,6 @@ static int jobs_init(void)
 
 static void jobs_exit(void)
 {
-	BUG_ON(!list_empty(&_complete_jobs));
-	BUG_ON(!list_empty(&_io_jobs));
-	BUG_ON(!list_empty(&_pages_jobs));
-
 	mempool_destroy(_job_pool);
 	kmem_cache_destroy(_job_cache);
 	_job_pool = NULL;
@@ -255,18 +250,19 @@ static void jobs_exit(void)
  * Functions to push and pop a job onto the head of a given job
  * list.
  */
-static struct kcopyd_job *pop(struct list_head *jobs)
+static struct kcopyd_job *pop(struct list_head *jobs,
+			      struct dm_kcopyd_client *kc)
 {
 	struct kcopyd_job *job = NULL;
 	unsigned long flags;
 
-	spin_lock_irqsave(&_job_lock, flags);
+	spin_lock_irqsave(&kc->job_lock, flags);
 
 	if (!list_empty(jobs)) {
 		job = list_entry(jobs->next, struct kcopyd_job, list);
 		list_del(&job->list);
 	}
-	spin_unlock_irqrestore(&_job_lock, flags);
+	spin_unlock_irqrestore(&kc->job_lock, flags);
 
 	return job;
 }
@@ -274,10 +270,11 @@ static struct kcopyd_job *pop(struct lis
 static void push(struct list_head *jobs, struct kcopyd_job *job)
 {
 	unsigned long flags;
+	struct dm_kcopyd_client *kc = job->kc;
 
-	spin_lock_irqsave(&_job_lock, flags);
+	spin_lock_irqsave(&kc->job_lock, flags);
 	list_add_tail(&job->list, jobs);
-	spin_unlock_irqrestore(&_job_lock, flags);
+	spin_unlock_irqrestore(&kc->job_lock, flags);
 }
 
 /*
@@ -310,6 +307,7 @@ static int run_complete_job(struct kcopy
 static void complete_io(unsigned long error, void *context)
 {
 	struct kcopyd_job *job = (struct kcopyd_job *) context;
+	struct dm_kcopyd_client *kc = job->kc;
 
 	if (error) {
 		if (job->rw == WRITE)
@@ -318,21 +316,21 @@ static void complete_io(unsigned long er
 			job->read_err = 1;
 
 		if (!test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) {
-			push(&_complete_jobs, job);
-			wake();
+			push(&kc->complete_jobs, job);
+			wake(kc);
 			return;
 		}
 	}
 
 	if (job->rw == WRITE)
-		push(&_complete_jobs, job);
+		push(&kc->complete_jobs, job);
 
 	else {
 		job->rw = WRITE;
-		push(&_io_jobs, job);
+		push(&kc->io_jobs, job);
 	}
 
-	wake();
+	wake(kc);
 }
 
 /*
@@ -369,7 +367,7 @@ static int run_pages_job(struct kcopyd_j
 	r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
 	if (!r) {
 		/* this job is ready for io */
-		push(&_io_jobs, job);
+		push(&job->kc->io_jobs, job);
 		return 0;
 	}
 
@@ -384,12 +382,13 @@ static int run_pages_job(struct kcopyd_j
  * Run through a list for as long as possible.  Returns the count
  * of successful jobs.
  */
-static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
+static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
+			int (*fn) (struct kcopyd_job *))
 {
 	struct kcopyd_job *job;
 	int r, count = 0;
 
-	while ((job = pop(jobs))) {
+	while ((job = pop(jobs, kc))) {
 
 		r = fn(job);
 
@@ -399,7 +398,7 @@ static int process_jobs(struct list_head
 				job->write_err = (unsigned long) -1L;
 			else
 				job->read_err = 1;
-			push(&_complete_jobs, job);
+			push(&kc->complete_jobs, job);
 			break;
 		}
 
@@ -421,8 +420,11 @@ static int process_jobs(struct list_head
 /*
  * kcopyd does this every time it's woken up.
  */
-static void do_work(struct work_struct *ignored)
+static void do_work(struct work_struct *work)
 {
+	struct dm_kcopyd_client *kc = container_of(work,
+					struct dm_kcopyd_client, kcopyd_work);
+
 	/*
	 * The order that these are called is *very* important.
	 * complete jobs can free some pages for pages jobs.
@@ -430,9 +432,9 @@ static void do_work(struct work_struct *
	 * list.  io jobs call wake when they complete and it all
	 * starts again.
	 */
-	process_jobs(&_complete_jobs, run_complete_job);
-	process_jobs(&_pages_jobs, run_pages_job);
-	process_jobs(&_io_jobs, run_io_job);
+	process_jobs(&kc->complete_jobs, kc, run_complete_job);
+	process_jobs(&kc->pages_jobs, kc, run_pages_job);
+	process_jobs(&kc->io_jobs, kc, run_io_job);
 }
 
 /*
@@ -442,9 +444,10 @@ static void do_work(struct work_struct *
  */
 static void dispatch_job(struct kcopyd_job *job)
 {
-	atomic_inc(&job->kc->nr_jobs);
-	push(&_pages_jobs, job);
-	wake();
+	struct dm_kcopyd_client *kc = job->kc;
+	atomic_inc(&kc->nr_jobs);
+	push(&kc->pages_jobs, job);
+	wake(kc);
 }
 
 #define SUB_JOB_SIZE 128
@@ -625,15 +628,7 @@ static int kcopyd_init(void)
 		return r;
 	}
 
-	_kcopyd_wq = create_singlethread_workqueue("kcopyd");
-	if (!_kcopyd_wq) {
-		jobs_exit();
-		mutex_unlock(&kcopyd_init_lock);
-		return -ENOMEM;
-	}
-
 	kcopyd_clients++;
-	INIT_WORK(&_kcopyd_work, do_work);
 	mutex_unlock(&kcopyd_init_lock);
 	return 0;
 }
@@ -644,8 +639,6 @@ static void kcopyd_exit(void)
 	kcopyd_clients--;
 	if (!kcopyd_clients) {
 		jobs_exit();
-		destroy_workqueue(_kcopyd_wq);
-		_kcopyd_wq = NULL;
 	}
 	mutex_unlock(&kcopyd_init_lock);
 }
@@ -662,15 +655,31 @@ int dm_kcopyd_client_create(unsigned int
 	kc = kmalloc(sizeof(*kc), GFP_KERNEL);
 	if (!kc) {
+		r = -ENOMEM;
 		kcopyd_exit();
-		return -ENOMEM;
+		return r;
 	}
 
 	spin_lock_init(&kc->lock);
+	spin_lock_init(&kc->job_lock);
+	INIT_LIST_HEAD(&kc->complete_jobs);
+	INIT_LIST_HEAD(&kc->io_jobs);
+	INIT_LIST_HEAD(&kc->pages_jobs);
+
+	INIT_WORK(&kc->kcopyd_work, do_work);
+	kc->kcopyd_wq = create_singlethread_workqueue("kcopyd");
+	if (!kc->kcopyd_wq) {
+		r = -ENOMEM;
+		kfree(kc);
+		kcopyd_exit();
+		return r;
+	}
+
 	kc->pages = NULL;
 	kc->nr_pages = kc->nr_free_pages = 0;
 	r = client_alloc_pages(kc, nr_pages);
 	if (r) {
+		destroy_workqueue(kc->kcopyd_wq);
 		kfree(kc);
 		kcopyd_exit();
 		return r;
 	}
@@ -680,6 +689,7 @@ int dm_kcopyd_client_create(unsigned int
 	if (IS_ERR(kc->io_client)) {
 		r = PTR_ERR(kc->io_client);
 		client_free_pages(kc);
+		destroy_workqueue(kc->kcopyd_wq);
 		kfree(kc);
 		kcopyd_exit();
 		return r;
@@ -699,6 +709,10 @@ void dm_kcopyd_client_destroy(struct dm_
 	/* Wait for completion of all jobs submitted by this client. */
 	wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
 
+	BUG_ON(!list_empty(&kc->complete_jobs));
+	BUG_ON(!list_empty(&kc->io_jobs));
+	BUG_ON(!list_empty(&kc->pages_jobs));
+	destroy_workqueue(kc->kcopyd_wq);
 	dm_io_client_destroy(kc->io_client);
 	client_free_pages(kc);
 	client_del(kc);
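
Illustrative sketch:
--------------------

The fix works because each kcopyd client now owns its job lists, its
job_lock and its own worker thread, so a stalled job can only ever block
the client it belongs to. The following self-contained user-space sketch
(not kernel code; build with "gcc -pthread") models that scheme. The
names struct client, push, pop and worker are illustrative stand-ins for
the kernel's dm_kcopyd_client, push(), pop() and do_work(), and a single
job list stands in for the patch's three lists.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct job {
	struct job *next;
	int id;
};

/* Analogue of struct dm_kcopyd_client: per-client lock, list, worker. */
struct client {
	pthread_mutex_t job_lock;	/* like kc->job_lock */
	pthread_cond_t wakeup;		/* like wake(kc) */
	struct job *jobs;		/* one list instead of three */
	int stop;
	pthread_t thread;		/* like kc->kcopyd_wq */
};

/* Like push() + wake(kc): queue a job on this client only. */
static void push(struct client *kc, struct job *job)
{
	pthread_mutex_lock(&kc->job_lock);
	job->next = kc->jobs;		/* LIFO for brevity; the kernel appends */
	kc->jobs = job;
	pthread_cond_signal(&kc->wakeup);
	pthread_mutex_unlock(&kc->job_lock);
}

/* Like pop(jobs, kc): take one job off this client's list. */
static struct job *pop(struct client *kc)
{
	struct job *job;

	pthread_mutex_lock(&kc->job_lock);
	while (!kc->jobs && !kc->stop)
		pthread_cond_wait(&kc->wakeup, &kc->job_lock);
	job = kc->jobs;
	if (job)
		kc->jobs = job->next;
	pthread_mutex_unlock(&kc->job_lock);
	return job;			/* NULL only when stopped and empty */
}

/*
 * Per-client worker, the analogue of do_work() on a per-client
 * workqueue.  If this worker blocks, only its own client stalls.
 */
static void *worker(void *arg)
{
	struct client *kc = arg;
	struct job *job;

	while ((job = pop(kc))) {
		printf("client %p ran job %d\n", (void *)kc, job->id);
		free(job);
	}
	return NULL;
}

static void client_create(struct client *kc)
{
	pthread_mutex_init(&kc->job_lock, NULL);
	pthread_cond_init(&kc->wakeup, NULL);
	kc->jobs = NULL;
	kc->stop = 0;
	pthread_create(&kc->thread, NULL, worker, kc);
}

/* Let the worker drain remaining jobs, then join it. */
static void client_destroy(struct client *kc)
{
	pthread_mutex_lock(&kc->job_lock);
	kc->stop = 1;
	pthread_cond_signal(&kc->wakeup);
	pthread_mutex_unlock(&kc->job_lock);
	pthread_join(kc->thread, NULL);
}

int main(void)
{
	struct client a, b;	/* two independent clients, like A and B */
	int i;

	client_create(&a);
	client_create(&b);
	for (i = 0; i < 4; i++) {
		struct job *j = malloc(sizeof(*j));
		j->id = i;
		push(i & 1 ? &a : &b, j);
	}
	client_destroy(&a);
	client_destroy(&b);
	return 0;
}

In the configuration above, A's job blocking in B's md->tio_pool now
stalls only A's worker. B's worker keeps draining B's own lists, so B's
recovery completes, the held requests are returned to md->tio_pool, and
A's job can make progress again.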