From: Mikulas Patocka <mpatocka@redhat.com>

Barrier support.

Barriers are submitted to a worker thread that issues them in-order.

The worker thread (dm_wq_work) is modified so that when it sees a barrier
request, it waits for all pending IO before the request, then submits
the barrier via __split_and_process_bio and waits for it (we must wait,
otherwise it could be intermixed with following requests).

Errors from the barrier request are recorded in a per-device barrier_error
variable. There may be only one barrier request in progress at a time, so
using a per-device variable is correct.

The barrier request is converted to a non-barrier request when it is sent
to the underlying device.

This patch guarantees correct barrier behavior if the underlying device
doesn't perform write-back caching. The same requirement existed before
barriers were supported in dm.

Bottom layer barrier support (sending barriers by target drivers) and
handling devices with write-back caches will be done in further patches.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/dm.c |   85 +++++++++++++++++++++++++++++++++++++++++---------------
 1 files changed, 63 insertions(+), 22 deletions(-)

Index: linux-2.6.29/drivers/md/dm.c
===================================================================
--- linux-2.6.29.orig/drivers/md/dm.c	2009-04-01 13:51:53.000000000 +0100
+++ linux-2.6.29/drivers/md/dm.c	2009-04-01 13:51:55.000000000 +0100
@@ -125,6 +125,11 @@ struct mapped_device {
 	spinlock_t deferred_lock;
 
 	/*
+	 * An error from the barrier request currently being processed.
+	 */
+	int barrier_error;
+
+	/*
 	 * Processing queue (flush/barriers)
 	 */
 	struct workqueue_struct *wq;
@@ -425,6 +430,10 @@ static void end_io_acct(struct dm_io *io
 	part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
 	part_stat_unlock();
 
+	/*
+	 * After this is decremented, the bio must not be touched if it is
+	 * a barrier bio.
+	 */
 	dm_disk(md)->part0.in_flight = pending =
 		atomic_dec_return(&md->pending);
 
@@ -528,25 +537,33 @@ static void dec_pending(struct dm_io *io
 			 */
 			spin_lock_irqsave(&md->deferred_lock, flags);
 			if (__noflush_suspending(md))
-				bio_list_add(&md->deferred, io->bio);
+				bio_list_add_head(&md->deferred, io->bio);
 			else
 				/* noflush suspend was interrupted. */
 				io->error = -EIO;
 			spin_unlock_irqrestore(&md->deferred_lock, flags);
 		}
 
-		end_io_acct(io);
-
 		io_error = io->error;
 		bio = io->bio;
 
-		free_io(md, io);
-
-		if (io_error != DM_ENDIO_REQUEUE) {
-			trace_block_bio_complete(md->queue, bio);
-
-			bio_endio(bio, io_error);
+		if (bio_barrier(bio)) {
+			/*
+			 * There can be just one barrier request, so using a
+			 * per-device variable for error reporting is OK.
+			 * Note that you can't touch the bio after end_io_acct.
+			 */
+			md->barrier_error = io_error;
+			end_io_acct(io);
+		} else {
+			end_io_acct(io);
+
+			if (io_error != DM_ENDIO_REQUEUE) {
+				trace_block_bio_complete(md->queue, bio);
+				bio_endio(bio, io_error);
+			}
 		}
+		free_io(md, io);
 	}
 }
 
@@ -688,7 +705,7 @@ static struct bio *split_bvec(struct bio
 
 	clone->bi_sector = sector;
 	clone->bi_bdev = bio->bi_bdev;
-	clone->bi_rw = bio->bi_rw;
+	clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER);
 	clone->bi_vcnt = 1;
 	clone->bi_size = to_bytes(len);
 	clone->bi_io_vec->bv_offset = offset;
@@ -709,6 +726,7 @@ static struct bio *clone_bio(struct bio 
 
 	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
 	__bio_clone(clone, bio);
+	clone->bi_rw &= ~(1 << BIO_RW_BARRIER);
 	clone->bi_destructor = dm_bio_destructor;
 	clone->bi_sector = sector;
 	clone->bi_idx = idx;
@@ -829,7 +847,10 @@ static void __split_and_process_bio(stru
 
 	ci.map = dm_get_table(md);
 	if (unlikely(!ci.map)) {
-		bio_io_error(bio);
+		if (!bio_barrier(bio))
+			bio_io_error(bio);
+		else
+			md->barrier_error = -EIO;
 		return;
 	}
 
@@ -913,15 +934,6 @@ static int dm_request(struct request_que
 	struct mapped_device *md = q->queuedata;
 	int cpu;
 
-	/*
-	 * There is no use in forwarding any barrier request since we can't
-	 * guarantee it is (or can be) handled by the targets correctly.
-	 */
-	if (unlikely(bio_barrier(bio))) {
-		bio_endio(bio, -EOPNOTSUPP);
-		return 0;
-	}
-
 	down_read(&md->io_lock);
 
 	cpu = part_stat_lock();
@@ -933,7 +945,8 @@ static int dm_request(struct request_que
 	 * If we're suspended or the thread is processing barriers
 	 * we have to queue this io for later.
 	 */
-	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))) {
+	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
+	    unlikely(bio_barrier(bio))) {
 		up_read(&md->io_lock);
 
 		if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
@@ -1397,6 +1410,12 @@ static int dm_wait_for_completion(struct
 	return r;
 }
 
+static int dm_flush(struct mapped_device *md)
+{
+	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
+	return 0;
+}
+
 /*
  * Process the deferred bios
  */
@@ -1420,8 +1439,30 @@ static void dm_wq_work(struct work_struc
 			break;
 		}
 
-		__split_and_process_bio(md, c);
+		if (!bio_barrier(c))
+			__split_and_process_bio(md, c);
+		else {
+			int error = dm_flush(md);
+			if (unlikely(error)) {
+				bio_endio(c, error);
+				goto next_bio;
+			}
+			if (bio_empty_barrier(c)) {
+				bio_endio(c, 0);
+				goto next_bio;
+			}
+
+			__split_and_process_bio(md, c);
+
+			error = dm_flush(md);
+			if (!error && md->barrier_error)
+				error = md->barrier_error;
+
+			if (md->barrier_error != DM_ENDIO_REQUEUE)
+				bio_endio(c, error);
+		}
 
+next_bio:
 		down_write(&md->io_lock);
 	}
 }