Index: linux-2.6.8.1-ck9/drivers/block/ll_rw_blk.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/block/ll_rw_blk.c	2004-08-15 14:08:05.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/block/ll_rw_blk.c	2004-10-02 11:38:02.986002354 +1000
@@ -263,6 +263,45 @@ void blk_queue_make_request(request_queu
 EXPORT_SYMBOL(blk_queue_make_request);
 
 /**
+ * blk_queue_ordered - set whether this queue supports ordered writes
+ * @q:    the request queue
+ * @flag: enable (non-zero) or disable (zero) ordered writes
+ *
+ * Description:
+ *   For journalled file systems, doing ordered writes on a commit
+ *   block instead of explicitly doing wait_on_buffer (which is bad
+ *   for performance) can be a big win. Block drivers supporting this
+ *   feature should call this function and indicate so.
+ *
+ **/
+void blk_queue_ordered(request_queue_t *q, int flag)
+{
+	if (flag)
+		set_bit(QUEUE_FLAG_ORDERED, &q->queue_flags);
+	else
+		clear_bit(QUEUE_FLAG_ORDERED, &q->queue_flags);
+}
+
+EXPORT_SYMBOL(blk_queue_ordered);
+
+/**
+ * blk_queue_issue_flush_fn - set function for issuing a flush
+ * @q:   the request queue
+ * @iff: the function to be called issuing the flush
+ *
+ * Description:
+ *   If a driver supports issuing a cache flush command, it notifies
+ *   the block layer of that capability through this call.
+ *
+ **/
+void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff)
+{
+	q->issue_flush_fn = iff;
+}
+
+EXPORT_SYMBOL(blk_queue_issue_flush_fn);
+
+/**
  * blk_queue_bounce_limit - set bounce buffer limit for queue
  * @q:  the request queue for the device
  * @dma_addr:   bus address limit
@@ -482,15 +521,14 @@ struct request *blk_queue_find_tag(reque
 EXPORT_SYMBOL(blk_queue_find_tag);
 
 /**
- * blk_queue_free_tags - release tag maintenance info
+ * __blk_queue_free_tags - release tag maintenance info
  * @q:  the request queue for the device
  *
  *  Notes:
  *	blk_cleanup_queue() will take care of calling this function, if tagging
- *	has been used. So there's usually no need to call this directly, unless
- *	tagging is just being disabled but the queue remains in function.
+ *	has been used. So there's no need to call this directly.
 **/
-void blk_queue_free_tags(request_queue_t *q)
+static void __blk_queue_free_tags(request_queue_t *q)
 {
 	struct blk_queue_tag *bqt = q->queue_tags;
 
@@ -514,12 +552,27 @@ void blk_queue_free_tags(request_queue_t
 	q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
 }
 
+/**
+ * blk_queue_free_tags - release tag maintenance info
+ * @q:  the request queue for the device
+ *
+ *  Notes:
+ *	This is used to disable tagged queuing on a device, while leaving
+ *	the queue in function.
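+ *
+ *	(The tag map itself is not freed here; it is released from
+ *	blk_cleanup_queue() via __blk_queue_free_tags() above.)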
+ **/
+void blk_queue_free_tags(request_queue_t *q)
+{
+	clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+}
+
 EXPORT_SYMBOL(blk_queue_free_tags);
 
 static int
 init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
 {
 	int bits, i;
+	struct request **tag_index;
+	unsigned long *tag_map;
 
 	if (depth > q->nr_requests * 2) {
 		depth = q->nr_requests * 2;
@@ -527,32 +580,31 @@ init_tag_map(request_queue_t *q, struct 
 			__FUNCTION__, depth);
 	}
 
-	tags->tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC);
-	if (!tags->tag_index)
+	tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC);
+	if (!tag_index)
 		goto fail;
 
 	bits = (depth / BLK_TAGS_PER_LONG) + 1;
-	tags->tag_map = kmalloc(bits * sizeof(unsigned long), GFP_ATOMIC);
-	if (!tags->tag_map)
+	tag_map = kmalloc(bits * sizeof(unsigned long), GFP_ATOMIC);
+	if (!tag_map)
 		goto fail;
 
-	memset(tags->tag_index, 0, depth * sizeof(struct request *));
-	memset(tags->tag_map, 0, bits * sizeof(unsigned long));
+	memset(tag_index, 0, depth * sizeof(struct request *));
+	memset(tag_map, 0, bits * sizeof(unsigned long));
 	tags->max_depth = depth;
 	tags->real_max_depth = bits * BITS_PER_LONG;
+	tags->tag_index = tag_index;
+	tags->tag_map = tag_map;
 
 	/*
 	 * set the upper bits if the depth isn't a multiple of the word size
 	 */
 	for (i = depth; i < bits * BLK_TAGS_PER_LONG; i++)
-		__set_bit(i, tags->tag_map);
+		__set_bit(i, tag_map);
 
-	INIT_LIST_HEAD(&tags->busy_list);
-	tags->busy = 0;
-	atomic_set(&tags->refcnt, 1);
 	return 0;
 fail:
-	kfree(tags->tag_index);
+	kfree(tag_index);
 	return -ENOMEM;
 }
 
@@ -564,13 +616,26 @@ fail:
 int blk_queue_init_tags(request_queue_t *q, int depth,
 			struct blk_queue_tag *tags)
 {
-	if (!tags) {
+	int rc;
+
+	BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
+
+	if (!tags && !q->queue_tags) {
 		tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
 		if (!tags)
 			goto fail;
 
 		if (init_tag_map(q, tags, depth))
 			goto fail;
+
+		INIT_LIST_HEAD(&tags->busy_list);
+		tags->busy = 0;
+		atomic_set(&tags->refcnt, 1);
+	} else if (q->queue_tags) {
+		if ((rc = blk_queue_resize_tags(q, depth)))
+			return rc;
+		set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+		return 0;
 	} else
 		atomic_inc(&tags->refcnt);
 
@@ -1335,8 +1400,8 @@ void blk_cleanup_queue(request_queue_t *
 	if (rl->rq_pool)
 		mempool_destroy(rl->rq_pool);
 
-	if (blk_queue_tagged(q))
-		blk_queue_free_tags(q);
+	if (q->queue_tags)
+		__blk_queue_free_tags(q);
 
 	kmem_cache_free(requestq_cachep, q);
 }
@@ -1925,10 +1990,11 @@ int blk_execute_rq(request_queue_t *q, s
 	}
 
 	rq->flags |= REQ_NOMERGE;
-	rq->waiting = &wait;
+	if (!rq->waiting)
+		rq->waiting = &wait;
 	elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1);
 	generic_unplug_device(q);
-	wait_for_completion(&wait);
+	wait_for_completion(rq->waiting);
 	rq->waiting = NULL;
 
 	if (rq->errors)
@@ -1939,6 +2005,72 @@ int blk_execute_rq(request_queue_t *q, s
 
 EXPORT_SYMBOL(blk_execute_rq);
 
+/**
+ * blkdev_issue_flush - queue a flush
+ * @bdev:	blockdev to issue flush for
+ * @error_sector: error sector
+ *
+ * Description:
+ *    Issue a flush for the block device in question. Caller can supply
+ *    room for storing the error offset in case of a flush error, if they
+ *    wish to. Caller must run wait_for_completion() on its own.
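+ *
+ *    As wired below, the flush implementations themselves block (both
+ *    the SCSI helper and the IDE variant go through blk_execute_rq()),
+ *    so a minimal caller sketch is simply:
+ *
+ *        sector_t bad_sector;
+ *        int err = blkdev_issue_flush(bdev, &bad_sector);
+ *
+ *    where an -EOPNOTSUPP return only means the queue has no
+ *    issue_flush_fn, which most callers can treat as success.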
+ */
+int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
+{
+	request_queue_t *q;
+
+	if (bdev->bd_disk == NULL)
+		return -ENXIO;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+	if (!q->issue_flush_fn)
+		return -EOPNOTSUPP;
+
+	return q->issue_flush_fn(q, bdev->bd_disk, error_sector);
+}
+
+EXPORT_SYMBOL(blkdev_issue_flush);
+
+/**
+ * blkdev_scsi_issue_flush_fn - issue flush for SCSI devices
+ * @q:		device queue
+ * @disk:	gendisk
+ * @error_sector: error offset
+ *
+ * Description:
+ *    Devices understanding the SCSI command set can use this function as
+ *    a helper for issuing a cache flush. Note: the driver is required to
+ *    store the error offset (in case of a flush error) in ->sector of
+ *    struct request.
+ */
+int blkdev_scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk,
+			       sector_t *error_sector)
+{
+	struct request *rq = blk_get_request(q, WRITE, __GFP_WAIT);
+	int ret;
+
+	rq->flags |= REQ_BLOCK_PC | REQ_SOFTBARRIER;
+	rq->sector = 0;
+	memset(rq->cmd, 0, sizeof(rq->cmd));
+	rq->cmd[0] = 0x35;
+	rq->cmd_len = 12;
+	rq->data = NULL;
+	rq->data_len = 0;
+	rq->timeout = 60 * HZ;
+
+	ret = blk_execute_rq(q, disk, rq);
+
+	if (ret && error_sector)
+		*error_sector = rq->sector;
+
+	blk_put_request(rq);
+	return ret;
+}
+
+EXPORT_SYMBOL(blkdev_scsi_issue_flush_fn);
+
 void drive_stat_acct(struct request *rq, int nr_sectors, int new_io)
 {
 	int rw = rq_data_dir(rq);
@@ -2192,7 +2324,7 @@ EXPORT_SYMBOL(__blk_attempt_remerge);
 static int __make_request(request_queue_t *q, struct bio *bio)
 {
 	struct request *req, *freereq = NULL;
-	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra;
+	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err;
 	sector_t sector;
 
 	sector = bio->bi_sector;
@@ -2210,9 +2342,11 @@ static int __make_request(request_queue_
 
 	spin_lock_prefetch(q->queue_lock);
 
-	barrier = test_bit(BIO_RW_BARRIER, &bio->bi_rw);
-
-	ra = bio->bi_rw & (1 << BIO_RW_AHEAD);
+	barrier = bio_barrier(bio);
+	if (barrier && !(q->queue_flags & (1 << QUEUE_FLAG_ORDERED))) {
+		err = -EOPNOTSUPP;
+		goto end_io;
+	}
 
 again:
 	spin_lock_irq(q->queue_lock);
@@ -2292,7 +2426,8 @@ get_rq:
 		/*
 		 * READA bit set
 		 */
-		if (ra)
+		err = -EWOULDBLOCK;
+		if (bio_rw_ahead(bio))
 			goto end_io;
 
 		freereq = get_request_wait(q, rw);
@@ -2303,10 +2438,9 @@ get_rq:
 	req->flags |= REQ_CMD;
 
 	/*
-	 * inherit FAILFAST from bio and don't stack up
-	 * retries for read ahead
+	 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
 	 */
-	if (ra || test_bit(BIO_RW_FAILFAST, &bio->bi_rw))
+	if (bio_rw_ahead(bio) || bio_failfast(bio))
 		req->flags |= REQ_FAILFAST;
 
 	/*
@@ -2340,7 +2474,7 @@ out:
 	return 0;
 
 end_io:
-	bio_endio(bio, nr_sectors << 9, -EWOULDBLOCK);
+	bio_endio(bio, nr_sectors << 9, err);
 	return 0;
 }
 
@@ -2647,10 +2781,17 @@ void blk_recalc_rq_sectors(struct reques
 static int __end_that_request_first(struct request *req, int uptodate,
 				    int nr_bytes)
 {
-	int total_bytes, bio_nbytes, error = 0, next_idx = 0;
+	int total_bytes, bio_nbytes, error, next_idx = 0;
 	struct bio *bio;
 
 	/*
+	 * extend uptodate bool to allow < 0 value to be direct io error
+	 */
+	error = 0;
+	if (end_io_error(uptodate))
+		error = !uptodate ? -EIO : uptodate;
+
+	/*
 	 * for a REQ_BLOCK_PC request, we want to carry any eventual
 	 * sense key with us all the way through
 	 */
@@ -2658,7 +2799,6 @@ static int __end_that_request_first(stru
 		req->errors = 0;
 
 	if (!uptodate) {
-		error = -EIO;
 		if (blk_fs_request(req) && !(req->flags & REQ_QUIET))
 			printk("end_request: I/O error, dev %s, sector %llu\n",
 				req->rq_disk ? req->rq_disk->disk_name : "?",
@@ -2741,7 +2881,7 @@ static int __end_that_request_first(stru
 /**
  * end_that_request_first - end I/O on a request
  * @req:      the request being processed
- * @uptodate: 0 for I/O error
+ * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
 * @nr_sectors: number of sectors to end I/O on
 *
 * Description:
@@ -2762,7 +2902,7 @@ EXPORT_SYMBOL(end_that_request_first);
 /**
  * end_that_request_chunk - end I/O on a request
  * @req:      the request being processed
- * @uptodate: 0 for I/O error
+ * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
 * @nr_bytes: number of bytes to complete
 *
 * Description:
Index: linux-2.6.8.1-ck9/drivers/ide/ide.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/ide/ide.c	2004-08-15 14:08:06.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/ide/ide.c	2004-10-02 11:38:02.988002042 +1000
@@ -437,6 +437,30 @@ u8 ide_dump_status (ide_drive_t *drive, 
 #endif	/* FANCY_STATUS_DUMPS */
 		printk("\n");
 	}
+	{
+		struct request *rq;
+		int opcode = 0x100;
+
+		spin_lock(&ide_lock);
+		rq = HWGROUP(drive)->rq;
+		spin_unlock(&ide_lock);
+		if (!rq)
+			goto out;
+		if (rq->flags & (REQ_DRIVE_CMD | REQ_DRIVE_TASK)) {
+			char *args = rq->buffer;
+			if (args)
+				opcode = args[0];
+		} else if (rq->flags & REQ_DRIVE_TASKFILE) {
+			ide_task_t *args = rq->special;
+			if (args) {
+				task_struct_t *tf = (task_struct_t *) args->tfRegister;
+				opcode = tf->command;
+			}
+		}
+
+		printk("ide: failed opcode was %x\n", opcode);
+	}
+out:
 	local_irq_restore(flags);
 	return err;
 }
Index: linux-2.6.8.1-ck9/drivers/ide/ide-disk.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/ide/ide-disk.c	2004-08-15 14:08:06.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/ide/ide-disk.c	2004-10-02 11:38:02.989001885 +1000
@@ -702,6 +702,37 @@ static u8 idedisk_dump_status (ide_drive
 	}
 #endif	/* FANCY_STATUS_DUMPS */
 	printk("\n");
+	{
+		struct request *rq;
+		unsigned char opcode = 0;
+		int found = 0;
+
+		spin_lock(&ide_lock);
+		rq = HWGROUP(drive)->rq;
+		spin_unlock(&ide_lock);
+		if (!rq)
+			goto out;
+		if (rq->flags & (REQ_DRIVE_CMD | REQ_DRIVE_TASK)) {
+			char *args = rq->buffer;
+			if (args) {
+				opcode = args[0];
+				found = 1;
+			}
+		} else if (rq->flags & REQ_DRIVE_TASKFILE) {
+			ide_task_t *args = rq->special;
+			if (args) {
+				task_struct_t *tf = (task_struct_t *) args->tfRegister;
+				opcode = tf->command;
+				found = 1;
+			}
+		}
+		printk("ide: failed opcode was: ");
+		if (!found)
+			printk("unknown\n");
+		else
+			printk("0x%02x\n", opcode);
+	}
+out:
 	local_irq_restore(flags);
 	return err;
 }
@@ -1203,6 +1234,42 @@ static ide_proc_entry_t idedisk_proc[] =
 
 #endif	/* CONFIG_PROC_FS */
 
+static int idedisk_issue_flush(request_queue_t *q, struct gendisk *disk,
+			       sector_t *error_sector)
+{
+	ide_drive_t *drive = q->queuedata;
+	struct request *rq;
+	int ret;
+
+	if (!drive->wcache)
+		return 0;
+
+	rq = blk_get_request(q, WRITE, __GFP_WAIT);
+
+	memset(rq->cmd, 0, sizeof(rq->cmd));
+
+	if (ide_id_has_flush_cache_ext(drive->id) &&
+	    (drive->capacity64 >= (1UL << 28)))
+		rq->cmd[0] = WIN_FLUSH_CACHE_EXT;
+	else
+		rq->cmd[0] = WIN_FLUSH_CACHE;
+
+	rq->flags |= REQ_DRIVE_TASK | REQ_SOFTBARRIER;
+	rq->buffer = rq->cmd;
+
+	ret = blk_execute_rq(q, disk, rq);
+
+	/*
+	 * if we failed and caller wants error offset, get it
+	 */
+	if (ret && error_sector)
+		*error_sector = ide_get_error_location(drive, rq->cmd);
+
+	blk_put_request(rq);
+	return ret;
+}
+
 /*
  * This is tightly woven into the driver->do_special can not touch.
  * DON'T do it again until a total personality rewrite is committed.
@@ -1231,16 +1298,10 @@ static int set_nowerr(ide_drive_t *drive
 	return 0;
 }
 
-/* check if CACHE FLUSH (EXT) command is supported (bits defined in ATA-6) */
-#define ide_id_has_flush_cache(id)	((id)->cfs_enable_2 & 0x3000)
-
-/* some Maxtor disks have bit 13 defined incorrectly so check bit 10 too */
-#define ide_id_has_flush_cache_ext(id)	\
-	(((id)->cfs_enable_2 & 0x2400) == 0x2400)
-
 static int write_cache (ide_drive_t *drive, int arg)
 {
 	ide_task_t args;
+	int err;
 
 	if (!ide_id_has_flush_cache(drive->id))
 		return 1;
@@ -1251,7 +1312,10 @@ static int write_cache (ide_drive_t *dri
 	args.tfRegister[IDE_COMMAND_OFFSET]	= WIN_SETFEATURES;
 	args.command_type			= IDE_DRIVE_TASK_NO_DATA;
 	args.handler				= &task_no_data_intr;
-	(void) ide_raw_taskfile(drive, &args, NULL);
+
+	err = ide_raw_taskfile(drive, &args, NULL);
+	if (err)
+		return err;
 
 	drive->wcache = arg;
 	return 0;
@@ -1412,6 +1476,7 @@ static void idedisk_setup (ide_drive_t *
 {
 	struct hd_driveid *id = drive->id;
 	unsigned long long capacity;
+	int barrier;
 
 	idedisk_add_settings(drive);
 
@@ -1543,6 +1608,27 @@ static void idedisk_setup (ide_drive_t *
 		drive->wcache = 1;
 
 	write_cache(drive, 1);
+
+	/*
+	 * decide if we can sanely support flushes and barriers on
+	 * this drive. unfortunately not all drives advertise FLUSH_CACHE
+	 * support even when they support it. So assume FLUSH_CACHE is there
+	 * always. LBA48 drives are newer, so expect it to flag support
+	 * properly. We can safely support FLUSH_CACHE on lba48, if capacity
+	 * doesn't exceed lba28
+	 */
+	barrier = 1;
+	if (drive->addressing == 1) {
+		if (capacity > (1ULL << 28) && !ide_id_has_flush_cache_ext(id))
+			barrier = 0;
+	}
+
+	printk("%s: cache flushes %ssupported\n",
+		drive->name, barrier ? "" : "not ");
+	if (barrier) {
+		blk_queue_ordered(drive->queue, 1);
+		blk_queue_issue_flush_fn(drive->queue, idedisk_issue_flush);
+	}
 }
 
 static void ide_cacheflush_p(ide_drive_t *drive)
Index: linux-2.6.8.1-ck9/drivers/ide/ide-io.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/ide/ide-io.c	2004-06-16 17:35:36.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/ide/ide-io.c	2004-10-02 11:38:02.991001573 +1000
@@ -54,38 +54,77 @@
 #include
 #include
 
-/**
- * ide_end_request	-	complete an IDE I/O
- * @drive: IDE device for the I/O
- * @uptodate:
- * @nr_sectors: number of sectors completed
- *
- * This is our end_request wrapper function. We complete the I/O
- * update random number input and dequeue the request, which if
- * it was tagged may be out of order.
+static void ide_fill_flush_cmd(ide_drive_t *drive, struct request *rq)
+{
+	char *buf = rq->cmd;
+
+	/*
+	 * reuse cdb space for ata command
+	 */
+	memset(buf, 0, sizeof(rq->cmd));
+
+	rq->flags |= REQ_DRIVE_TASK | REQ_STARTED;
+	rq->buffer = buf;
+	rq->buffer[0] = WIN_FLUSH_CACHE;
+
+	if (ide_id_has_flush_cache_ext(drive->id) &&
+	    (drive->capacity64 >= (1UL << 28)))
+		rq->buffer[0] = WIN_FLUSH_CACHE_EXT;
+}
+
+/*
+ * preempt pending requests, and store this cache flush for immediate
+ * execution
  */
-
-int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors)
+static struct request *ide_queue_flush_cmd(ide_drive_t *drive,
+					   struct request *rq, int post)
 {
-	struct request *rq;
-	unsigned long flags;
-	int ret = 1;
+	struct request *flush_rq = &HWGROUP(drive)->wrq;
 
-	spin_lock_irqsave(&ide_lock, flags);
-	rq = HWGROUP(drive)->rq;
+	/*
+	 * write cache disabled, clear the barrier bit and treat it like
+	 * an ordinary write
+	 */
+	if (!drive->wcache) {
+		rq->flags |= REQ_BAR_PREFLUSH;
+		return rq;
+	}
 
-	BUG_ON(!(rq->flags & REQ_STARTED));
+	ide_init_drive_cmd(flush_rq);
+	ide_fill_flush_cmd(drive, flush_rq);
 
-	if (!nr_sectors)
-		nr_sectors = rq->hard_cur_sectors;
+	flush_rq->special = rq;
+	flush_rq->nr_sectors = rq->nr_sectors;
+
+	if (!post) {
+		drive->doing_barrier = 1;
+		flush_rq->flags |= REQ_BAR_PREFLUSH;
+		blkdev_dequeue_request(rq);
+	} else
+		flush_rq->flags |= REQ_BAR_POSTFLUSH;
+
+	__elv_add_request(drive->queue, flush_rq, ELEVATOR_INSERT_FRONT, 0);
+	HWGROUP(drive)->rq = NULL;
+	return flush_rq;
+}
+
+static int __ide_end_request(ide_drive_t *drive, struct request *rq,
+			     int uptodate, int nr_sectors)
+{
+	int ret = 1;
+
+	BUG_ON(!(rq->flags & REQ_STARTED));
 
 	/*
 	 * if failfast is set on a request, override number of sectors and
 	 * complete the whole request right now
 	 */
-	if (blk_noretry_request(rq) && !uptodate)
+	if (blk_noretry_request(rq) && end_io_error(uptodate))
 		nr_sectors = rq->hard_nr_sectors;
 
+	if (!blk_fs_request(rq) && end_io_error(uptodate) && !rq->errors)
+		rq->errors = -EIO;
+
 	/*
 	 * decide whether to reenable DMA -- 3 is a random magic for now,
 	 * if we DMA timeout more than 3 times, just stay in PIO
@@ -97,15 +136,56 @@ int ide_end_request (ide_drive_t *drive,
 	if (!end_that_request_first(rq, uptodate, nr_sectors)) {
 		add_disk_randomness(rq->rq_disk);
+
+		if (blk_rq_tagged(rq))
+			blk_queue_end_tag(drive->queue, rq);
+
 		blkdev_dequeue_request(rq);
 		HWGROUP(drive)->rq = NULL;
 		end_that_request_last(rq);
 		ret = 0;
 	}
-	spin_unlock_irqrestore(&ide_lock, flags);
 	return ret;
 }
 
+/**
+ * ide_end_request - complete an IDE I/O
+ * @drive: IDE device for the I/O
+ * @uptodate:
+ * @nr_sectors: number of sectors completed
+ *
+ * This is our end_request wrapper function. We complete the I/O,
+ * update the random number input and dequeue the request, which, if
+ * it was tagged, may be out of order.
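+ *
+ * For a barrier write on a drive with write-back caching enabled,
+ * completion is deferred: the sectors are accounted against the
+ * post-flush request built by ide_queue_flush_cmd() above, and the
+ * original request is only completed from ide_complete_barrier()
+ * once that FLUSH CACHE command finishes.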
+ */
+
+int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors)
+{
+	struct request *rq;
+	unsigned long flags;
+	int ret = 1;
+
+	spin_lock_irqsave(&ide_lock, flags);
+	rq = HWGROUP(drive)->rq;
+
+	if (!nr_sectors)
+		nr_sectors = rq->hard_cur_sectors;
+
+	if (!blk_barrier_rq(rq) || !drive->wcache)
+		ret = __ide_end_request(drive, rq, uptodate, nr_sectors);
+	else {
+		struct request *flush_rq = &HWGROUP(drive)->wrq;
+
+		flush_rq->nr_sectors -= nr_sectors;
+		if (!flush_rq->nr_sectors) {
+			ide_queue_flush_cmd(drive, rq, 1);
+			ret = 0;
+		}
+	}
+
+	spin_unlock_irqrestore(&ide_lock, flags);
+	return ret;
+}
 EXPORT_SYMBOL(ide_end_request);
 
 /**
@@ -137,6 +217,113 @@ static void ide_complete_pm_request (ide
 	spin_unlock_irqrestore(&ide_lock, flags);
 }
 
+/*
+ * FIXME: probably move this somewhere else, name is bad too :)
+ */
+u64 ide_get_error_location(ide_drive_t *drive, char *args)
+{
+	u32 high, low;
+	u8 hcyl, lcyl, sect;
+	u64 sector;
+
+	high = 0;
+	hcyl = args[5];
+	lcyl = args[4];
+	sect = args[3];
+
+	if (ide_id_has_flush_cache_ext(drive->id)) {
+		low = (hcyl << 16) | (lcyl << 8) | sect;
+		HWIF(drive)->OUTB(drive->ctl|0x80, IDE_CONTROL_REG);
+		high = ide_read_24(drive);
+	} else {
+		u8 cur = HWIF(drive)->INB(IDE_SELECT_REG);
+		if (cur & 0x40)
+			low = (hcyl << 16) | (lcyl << 8) | sect;
+		else {
+			low = hcyl * drive->head * drive->sect;
+			low += lcyl * drive->sect;
+			low += sect - 1;
+		}
+	}
+
+	sector = ((u64) high << 24) | low;
+	return sector;
+}
+EXPORT_SYMBOL(ide_get_error_location);
+
+static void ide_complete_barrier(ide_drive_t *drive, struct request *rq,
+				 int error)
+{
+	struct request *real_rq = rq->special;
+	int good_sectors, bad_sectors;
+	sector_t sector;
+
+	if (!error) {
+		if (blk_barrier_postflush(rq)) {
+			/*
+			 * this completes the barrier write
+			 */
+			__ide_end_request(drive, real_rq, 1, real_rq->hard_nr_sectors);
+			drive->doing_barrier = 0;
+		} else {
+			/*
+			 * just indicate that we did the pre flush
+			 */
+			real_rq->flags |= REQ_BAR_PREFLUSH;
+			elv_requeue_request(drive->queue, real_rq);
+		}
+		/*
+		 * all is fine, return
+		 */
+		return;
+	}
+
+	/*
+	 * we need to end real_rq, but it's not on the queue currently.
+	 * put it back on the queue, so we don't have to special case
+	 * anything else for completing it
+	 */
+	if (!blk_barrier_postflush(rq))
+		elv_requeue_request(drive->queue, real_rq);
+
+	/*
+	 * drive aborted flush command, assume FLUSH_CACHE_* doesn't
+	 * work and disable barrier support
+	 */
+	if (error & ABRT_ERR) {
+		printk(KERN_ERR "%s: barrier support doesn't work\n", drive->name);
+		__ide_end_request(drive, real_rq, -EOPNOTSUPP, real_rq->hard_nr_sectors);
+		blk_queue_ordered(drive->queue, 0);
+		blk_queue_issue_flush_fn(drive->queue, NULL);
+	} else {
+		/*
+		 * find out what part of the request failed
+		 */
+		good_sectors = 0;
+		if (blk_barrier_postflush(rq)) {
+			sector = ide_get_error_location(drive, rq->buffer);
+
+			if ((sector >= real_rq->hard_sector) &&
+			    (sector < real_rq->hard_sector + real_rq->hard_nr_sectors))
+				good_sectors = sector - real_rq->hard_sector;
+		} else
+			sector = real_rq->hard_sector;
+
+		bad_sectors = real_rq->hard_nr_sectors - good_sectors;
+		if (good_sectors)
+			__ide_end_request(drive, real_rq, 1, good_sectors);
+		if (bad_sectors)
+			__ide_end_request(drive, real_rq, 0, bad_sectors);
+
+		printk(KERN_ERR "%s: failed barrier write: "
+				"sector=%Lx(good=%d/bad=%d)\n",
+				drive->name, (unsigned long long)sector,
+				good_sectors, bad_sectors);
+	}
+
+	drive->doing_barrier = 0;
+}
+
 /**
  * ide_end_drive_cmd - end an explicit drive command
  * @drive: command
@@ -226,6 +413,10 @@ void ide_end_drive_cmd (ide_drive_t *dri
 	spin_lock_irqsave(&ide_lock, flags);
 	blkdev_dequeue_request(rq);
+
+	if (blk_barrier_preflush(rq) || blk_barrier_postflush(rq))
+		ide_complete_barrier(drive, rq, err);
+
 	HWGROUP(drive)->rq = NULL;
 	end_that_request_last(rq);
 	spin_unlock_irqrestore(&ide_lock, flags);
@@ -712,6 +903,22 @@ static inline ide_drive_t *choose_drive 
 repeat:
 	best = NULL;
 	drive = hwgroup->drive;
+
+	/*
+	 * drive is doing pre-flush, ordered write, post-flush sequence. even
+	 * though that is 3 requests, it must be seen as a single transaction.
+	 * we must not preempt this drive until that is complete
+	 */
+	if (drive->doing_barrier) {
+		/*
+		 * small race where queue could get replugged during
+		 * the 3-request flush cycle, just yank the plug since
+		 * we want it to finish asap
+		 */
+		blk_remove_plug(drive->queue);
+		return drive;
+	}
+
 	do {
 		if ((!drive->sleep || time_after_eq(jiffies, drive->sleep))
 		    && !elv_queue_empty(drive->queue)) {
@@ -868,6 +1075,13 @@ void ide_do_request (ide_hwgroup_t *hwgr
 		}
 
 		/*
+		 * if rq is a barrier write, issue pre cache flush if not
+		 * already done
+		 */
+		if (blk_barrier_rq(rq) && !blk_barrier_preflush(rq))
+			rq = ide_queue_flush_cmd(drive, rq, 0);
+
+		/*
 		 * Sanity: don't accept a request that isn't a PM request
 		 * if we are currently power managed. This is very important as
 		 * blk_stop_queue() doesn't prevent the elv_next_request()
@@ -917,7 +1131,9 @@ EXPORT_SYMBOL(ide_do_request);
 */
 void do_ide_request(request_queue_t *q)
 {
-	ide_do_request(q->queuedata, IDE_NO_IRQ);
+	ide_drive_t *drive = q->queuedata;
+
+	ide_do_request(HWGROUP(drive), IDE_NO_IRQ);
 }
 
 /*
@@ -1286,6 +1502,7 @@ void ide_init_drive_cmd (struct request 
 {
 	memset(rq, 0, sizeof(*rq));
 	rq->flags = REQ_DRIVE_CMD;
+	rq->ref_count = 1;
 }
 
 EXPORT_SYMBOL(ide_init_drive_cmd);
Index: linux-2.6.8.1-ck9/drivers/ide/ide-probe.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/ide/ide-probe.c	2004-06-16 17:35:36.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/ide/ide-probe.c	2004-10-02 11:38:02.993001260 +1000
@@ -893,7 +893,7 @@ static int ide_init_queue(ide_drive_t *d
 	if (!q)
 		return 1;
 
-	q->queuedata = HWGROUP(drive);
+	q->queuedata = drive;
 	blk_queue_segment_boundary(q, 0xffff);
 
 	if (!hwif->rqsize)
Index: linux-2.6.8.1-ck9/drivers/md/dm.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/dm.c	2004-08-15 14:08:06.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/dm.c	2004-10-02 11:38:02.994001104 +1000
@@ -597,6 +597,21 @@ static int dm_request(request_queue_t *q
 	return 0;
 }
 
+static int dm_flush_all(request_queue_t *q, struct gendisk *disk,
+			sector_t *error_sector)
+{
+	struct mapped_device *md = q->queuedata;
+	struct dm_table *map = dm_get_table(md);
+	int ret = -ENXIO;
+
+	if (map) {
+		ret = dm_table_flush_all(map);
+		dm_table_put(map);
+	}
+
+	return ret;
+}
+
 static void dm_unplug_all(request_queue_t *q)
 {
 	struct mapped_device *md = q->queuedata;
@@ -764,6 +779,7 @@ static struct mapped_device *alloc_dev(u
 	md->queue->backing_dev_info.congested_data = md;
 	blk_queue_make_request(md->queue, dm_request);
 	md->queue->unplug_fn = dm_unplug_all;
+	md->queue->issue_flush_fn = dm_flush_all;
 
 	md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
 				     mempool_free_slab, _io_cache);
Index: linux-2.6.8.1-ck9/drivers/md/dm.h
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/dm.h	2004-06-16 17:35:36.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/dm.h	2004-10-02 11:38:02.994001104 +1000
@@ -113,6 +113,7 @@ void dm_table_suspend_targets(struct dm_
 void dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
 void dm_table_unplug_all(struct dm_table *t);
+int dm_table_flush_all(struct dm_table *t);
 
 /*-----------------------------------------------------------------
 * A registry of target types.
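The walk-the-members pattern that dm_table_flush_all() implements in the next file recurs in every stacking driver this patch touches (linear, raid0, raid1, raid5, raid6, multipath): look up each member device's queue, call its issue_flush_fn if one is set, and propagate the error. A condensed sketch of that contract follows; the driver type and the example_conf, nr_devs and devs names are invented for illustration:

	static int example_issue_flush(request_queue_t *q, struct gendisk *disk,
				       sector_t *error_sector)
	{
		struct example_conf *conf = q->queuedata;	/* hypothetical */
		int i, ret = 0;

		for (i = 0; i < conf->nr_devs; i++) {
			struct block_device *bdev = conf->devs[i];
			request_queue_t *r_queue = bdev_get_queue(bdev);

			/* a member that cannot flush makes the whole set unflushable */
			if (!r_queue->issue_flush_fn)
				return -EOPNOTSUPP;

			ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
						      error_sector);
			if (ret)
				break;
		}
		return ret;
	}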
Index: linux-2.6.8.1-ck9/drivers/md/dm-table.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/dm-table.c	2004-08-15 14:08:06.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/dm-table.c	2004-10-02 11:38:02.995000948 +1000
@@ -900,6 +900,28 @@ void dm_table_unplug_all(struct dm_table
 	}
 }
 
+int dm_table_flush_all(struct dm_table *t)
+{
+	struct list_head *d, *devices = dm_table_get_devices(t);
+	int ret = 0;
+
+	for (d = devices->next; d != devices; d = d->next) {
+		struct dm_dev *dd = list_entry(d, struct dm_dev, list);
+		request_queue_t *q = bdev_get_queue(dd->bdev);
+		int err;
+
+		if (!q->issue_flush_fn)
+			err = -EOPNOTSUPP;
+		else
+			err = q->issue_flush_fn(q, dd->bdev->bd_disk, NULL);
+
+		if (!ret)
+			ret = err;
+	}
+
+	return ret;
+}
+
 EXPORT_SYMBOL(dm_vcalloc);
 EXPORT_SYMBOL(dm_get_device);
 EXPORT_SYMBOL(dm_put_device);
@@ -908,3 +930,4 @@ EXPORT_SYMBOL(dm_table_get_mode);
 EXPORT_SYMBOL(dm_table_put);
 EXPORT_SYMBOL(dm_table_get);
 EXPORT_SYMBOL(dm_table_unplug_all);
+EXPORT_SYMBOL(dm_table_flush_all);
Index: linux-2.6.8.1-ck9/drivers/md/linear.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/linear.c	2004-05-23 12:54:50.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/linear.c	2004-10-02 11:38:02.996000792 +1000
@@ -47,7 +47,6 @@ static inline dev_info_t *which_dev(mdde
 	return hash->dev0;
 }
 
-
 /**
 *	linear_mergeable_bvec -- tell bio layer if a two requests can be merged
 *	@q: request queue
@@ -93,6 +92,27 @@ static void linear_unplug(request_queue_
 	}
 }
 
+static int linear_issue_flush(request_queue_t *q, struct gendisk *disk,
+			      sector_t *error_sector)
+{
+	mddev_t *mddev = q->queuedata;
+	linear_conf_t *conf = mddev_to_conf(mddev);
+	int i, ret = 0;
+
+	for (i=0; i < mddev->raid_disks; i++) {
+		struct block_device *bdev = conf->disks[i].rdev->bdev;
+		request_queue_t *r_queue = bdev_get_queue(bdev);
+
+		if (!r_queue->issue_flush_fn) {
+			ret = -EOPNOTSUPP;
+			break;
+		}
+
+		ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
+		if (ret)
+			break;
+	}
+	return ret;
+}
 
 static int linear_run (mddev_t *mddev)
 {
@@ -200,6 +220,7 @@ static int linear_run (mddev_t *mddev)
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
 	mddev->queue->unplug_fn = linear_unplug;
+	mddev->queue->issue_flush_fn = linear_issue_flush;
 	return 0;
 
 out:
Index: linux-2.6.8.1-ck9/drivers/md/md.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/md.c	2004-06-16 17:35:36.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/md.c	2004-10-02 11:38:02.998000479 +1000
@@ -154,6 +154,39 @@ static spinlock_t all_mddevs_lock = SPIN
 		tmp = tmp->next;})					\
 		)
 
+int md_flush_mddev(mddev_t *mddev, sector_t *error_sector)
+{
+	struct list_head *tmp;
+	mdk_rdev_t *rdev;
+	int ret = 0;
+
+	/*
+	 * this list iteration is done without any locking in md?!
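+	 * (note that the loop below keeps going after a failure: every
+	 * rdev is flushed, and the first non-zero status is returned)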
+	 */
+	ITERATE_RDEV(mddev, rdev, tmp) {
+		request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
+		int err;
+
+		if (!r_queue->issue_flush_fn)
+			err = -EOPNOTSUPP;
+		else
+			err = r_queue->issue_flush_fn(r_queue, rdev->bdev->bd_disk, error_sector);
+
+		if (!ret)
+			ret = err;
+	}
+
+	return ret;
+}
+
+static int md_flush_all(request_queue_t *q, struct gendisk *disk,
+			sector_t *error_sector)
+{
+	mddev_t *mddev = q->queuedata;
+
+	return md_flush_mddev(mddev, error_sector);
+}
+
 static int md_fail_request (request_queue_t *q, struct bio *bio)
 {
 	bio_io_error(bio, bio->bi_size);
@@ -1645,6 +1678,7 @@ static int do_md_run(mddev_t * mddev)
 	 */
 	mddev->queue->queuedata = mddev;
 	mddev->queue->make_request_fn = mddev->pers->make_request;
+	mddev->queue->issue_flush_fn = md_flush_all;
 
 	mddev->changed = 1;
 	return 0;
Index: linux-2.6.8.1-ck9/drivers/md/multipath.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/multipath.c	2004-08-15 14:08:06.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/multipath.c	2004-10-02 11:38:02.999000323 +1000
@@ -120,7 +120,7 @@ int multipath_end_request(struct bio *bi
 
 	if (uptodate)
 		multipath_end_bh_io(mp_bh, uptodate);
-	else if ((bio->bi_rw & (1 << BIO_RW_AHEAD)) == 0) {
+	else if (!bio_rw_ahead(bio)) {
 		/*
 		 * oops, IO error:
 		 */
@@ -217,6 +217,31 @@ static void multipath_status (struct seq
 	seq_printf (seq, "]");
 }
 
+static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk,
+				 sector_t *error_sector)
+{
+	mddev_t *mddev = q->queuedata;
+	multipath_conf_t *conf = mddev_to_conf(mddev);
+	int i, ret = 0;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->multipaths[i].rdev;
+		if (rdev && !rdev->faulty) {
+			struct block_device *bdev = rdev->bdev;
+			request_queue_t *r_queue = bdev_get_queue(bdev);
+
+			if (!r_queue->issue_flush_fn) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
+
+			ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
+			if (ret)
+				break;
+		}
+	}
+	return ret;
+}
 
 /*
 * Careful, this can execute in IRQ contexts as well!
@@ -435,6 +460,8 @@ static int multipath_run (mddev_t *mddev
 
 	mddev->queue->unplug_fn = multipath_unplug;
 
+	mddev->queue->issue_flush_fn = multipath_issue_flush;
+
 	conf->working_disks = 0;
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		disk_idx = rdev->raid_disk;
Index: linux-2.6.8.1-ck9/drivers/md/raid0.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/raid0.c	2004-05-23 12:54:50.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/raid0.c	2004-10-02 11:38:03.000000167 +1000
@@ -40,6 +40,31 @@ static void raid0_unplug(request_queue_t
 	}
 }
 
+static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk,
+			     sector_t *error_sector)
+{
+	mddev_t *mddev = q->queuedata;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	mdk_rdev_t **devlist = conf->strip_zone[0].dev;
+	int i, ret = 0;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		struct block_device *bdev = devlist[i]->bdev;
+		request_queue_t *r_queue = bdev_get_queue(bdev);
+
+		if (!r_queue->issue_flush_fn) {
+			ret = -EOPNOTSUPP;
+			break;
+		}
+
+		ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
 static int create_strip_zones (mddev_t *mddev)
 {
 	int i, c, j;
@@ -219,6 +244,8 @@ static int create_strip_zones (mddev_t *
 
 	mddev->queue->unplug_fn = raid0_unplug;
 
+	mddev->queue->issue_flush_fn = raid0_issue_flush;
+
 	printk("raid0: done.\n");
 	return 0;
 abort:
Index: linux-2.6.8.1-ck9/drivers/md/raid1.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/raid1.c	2004-08-15 14:08:06.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/raid1.c	2004-10-02 11:38:03.001000010 +1000
@@ -481,6 +481,32 @@ static void raid1_unplug(request_queue_t
 	unplug_slaves(q->queuedata);
 }
 
+static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
+			     sector_t *error_sector)
+{
+	mddev_t *mddev = q->queuedata;
+	conf_t *conf = mddev_to_conf(mddev);
+	unsigned long flags;
+	int i, ret = 0;
+
+	spin_lock_irqsave(&conf->device_lock, flags);
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+		if (rdev && !rdev->faulty) {
+			struct block_device *bdev = rdev->bdev;
+			request_queue_t *r_queue = bdev_get_queue(bdev);
+
+			if (r_queue->issue_flush_fn) {
+				ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
+				if (ret)
+					break;
+			}
+		}
+	}
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+	return ret;
+}
+
 /*
 * Throttle resync depth, so that we can both get proper overlapping of
 * requests, but are still able to handle normal requests quickly.
@@ -1168,6 +1194,7 @@ static int run(mddev_t *mddev)
 
 	mddev->queue->unplug_fn = raid1_unplug;
 
+	mddev->queue->issue_flush_fn = raid1_issue_flush;
 
 	ITERATE_RDEV(mddev, rdev, tmp) {
 		disk_idx = rdev->raid_disk;
Index: linux-2.6.8.1-ck9/drivers/md/raid5.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/raid5.c	2004-08-15 14:08:06.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/raid5.c	2004-10-02 11:38:03.001999854 +1000
@@ -1339,6 +1339,39 @@ static void raid5_unplug_device(request_
 	unplug_slaves(mddev);
 }
 
+static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
+			     sector_t *error_sector)
+{
+	mddev_t *mddev = q->queuedata;
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+	int i, ret = 0;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->disks[i].rdev;
+		if (rdev && !rdev->faulty) {
+			struct block_device *bdev = rdev->bdev;
+			request_queue_t *r_queue;
+
+			if (!bdev)
+				continue;
+
+			r_queue = bdev_get_queue(bdev);
+			if (!r_queue)
+				continue;
+
+			if (!r_queue->issue_flush_fn) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
+
+			ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
+			if (ret)
+				break;
+		}
+	}
+	return ret;
+}
+
 static inline void raid5_plug_device(raid5_conf_t *conf)
 {
 	spin_lock_irq(&conf->device_lock);
@@ -1545,6 +1578,7 @@ static int run (mddev_t *mddev)
 	atomic_set(&conf->preread_active_stripes, 0);
 
 	mddev->queue->unplug_fn = raid5_unplug_device;
+	mddev->queue->issue_flush_fn = raid5_issue_flush;
 
 	PRINTK("raid5: run(%s) called.\n", mdname(mddev));
 
Index: linux-2.6.8.1-ck9/drivers/md/raid6main.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/raid6main.c	2004-08-15 14:08:06.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/raid6main.c	2004-10-02 11:38:03.003999542 +1000
@@ -1501,6 +1501,39 @@ static void raid6_unplug_device(request_
 	unplug_slaves(mddev);
 }
 
+static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk,
+			     sector_t *error_sector)
+{
+	mddev_t *mddev = q->queuedata;
+	raid6_conf_t *conf = mddev_to_conf(mddev);
+	int i, ret = 0;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->disks[i].rdev;
+		if (rdev && !rdev->faulty) {
+			struct block_device *bdev = rdev->bdev;
+			request_queue_t *r_queue;
+
+			if (!bdev)
+				continue;
+
+			r_queue = bdev_get_queue(bdev);
+			if (!r_queue)
+				continue;
+
+			if (!r_queue->issue_flush_fn) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
+
+			ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
+			if (ret)
+				break;
+		}
+	}
+	return ret;
+}
+
 static inline void raid6_plug_device(raid6_conf_t *conf)
 {
 	spin_lock_irq(&conf->device_lock);
@@ -1708,6 +1741,7 @@ static int run (mddev_t *mddev)
 	atomic_set(&conf->preread_active_stripes, 0);
 
 	mddev->queue->unplug_fn = raid6_unplug_device;
+	mddev->queue->issue_flush_fn = raid6_issue_flush;
 
 	PRINTK("raid6: run(%s) called.\n", mdname(mddev));
 
Index: linux-2.6.8.1-ck9/drivers/scsi/scsi_lib.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/scsi/scsi_lib.c	2004-08-15 14:08:08.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/scsi/scsi_lib.c	2004-10-02 11:38:03.014997823 +1000
@@ -954,6 +954,22 @@ static int scsi_init_io(struct scsi_cmnd
 	return BLKPREP_KILL;
 }
 
+static int scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk,
+			       sector_t *error_sector)
+{
+	struct scsi_device *sdev = q->queuedata;
+	struct scsi_driver *drv;
+
+	if (sdev->sdev_state != SDEV_RUNNING)
+		return -ENXIO;
+
+	drv = *(struct scsi_driver **) disk->private_data;
+	if (drv->issue_flush)
+		return drv->issue_flush(&sdev->sdev_gendev, error_sector);
+
+	return -EOPNOTSUPP;
+}
+
 static int scsi_prep_fn(struct request_queue *q, struct request *req)
 {
 	struct scsi_device *sdev = q->queuedata;
@@ -1335,7 +1351,8 @@ struct request_queue *scsi_alloc_queue(s
 	blk_queue_max_sectors(q, shost->max_sectors);
 	blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
 	blk_queue_segment_boundary(q, shost->dma_boundary);
-
+	blk_queue_issue_flush_fn(q, scsi_issue_flush_fn);
+
 	if (!shost->use_clustering)
 		clear_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
 	return q;
Index: linux-2.6.8.1-ck9/drivers/scsi/sd.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/scsi/sd.c	2004-08-15 14:08:08.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/scsi/sd.c	2004-10-02 11:38:03.016997510 +1000
@@ -113,6 +113,7 @@ static int sd_remove(struct device *);
 static void sd_shutdown(struct device *dev);
 static void sd_rescan(struct device *);
 static int sd_init_command(struct scsi_cmnd *);
+static int sd_issue_flush(struct device *, sector_t *);
 static void sd_read_capacity(struct scsi_disk *sdkp, char *diskname,
		 struct scsi_request *SRpnt, unsigned char *buffer);
 
@@ -126,6 +127,7 @@ static struct scsi_driver sd_template = 
 	},
 	.rescan			= sd_rescan,
 	.init_command		= sd_init_command,
+	.issue_flush		= sd_issue_flush,
 };
 
 /*
 * Device no to disk mapping:
@@ -676,6 +678,62 @@ not_present:
 	return 1;
 }
 
+static int sd_sync_cache(struct scsi_device *sdp)
+{
+	struct scsi_request *sreq;
+	int retries, res;
+
+	if (!scsi_device_online(sdp))
+		return -ENODEV;
+
+	sreq = scsi_allocate_request(sdp, GFP_KERNEL);
+	if (!sreq) {
+		printk("FAILED\n  No memory for request\n");
+		return -ENOMEM;
+	}
+
+	sreq->sr_data_direction = DMA_NONE;
+	for (retries = 3; retries > 0; --retries) {
+		unsigned char cmd[10] = { 0 };
+
+		cmd[0] = SYNCHRONIZE_CACHE;
+		/*
+		 * Leave the rest of the command zero to indicate
+		 * flush everything.
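+		 * (for SYNCHRONIZE CACHE, a zero LBA and a zero
+		 * number-of-blocks field mean "the whole medium" per SBC)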
+		 */
+		scsi_wait_req(sreq, cmd, NULL, 0, SD_TIMEOUT, SD_MAX_RETRIES);
+		if (sreq->sr_result == 0)
+			break;
+	}
+
+	res = sreq->sr_result;
+	if (res) {
+		printk(KERN_WARNING "FAILED\n  status = %x, message = %02x, "
+				    "host = %d, driver = %02x\n  ",
+				    status_byte(res), msg_byte(res),
+				    host_byte(res), driver_byte(res));
+		if (driver_byte(res) & DRIVER_SENSE)
+			scsi_print_req_sense("sd", sreq);
+	}
+
+	scsi_release_request(sreq);
+	return res;
+}
+
+static int sd_issue_flush(struct device *dev, sector_t *error_sector)
+{
+	struct scsi_device *sdp = to_scsi_device(dev);
+	struct scsi_disk *sdkp = dev_get_drvdata(dev);
+
+	if (!sdkp)
+		return -ENODEV;
+
+	if (!sdkp->WCE)
+		return 0;
+
+	return sd_sync_cache(sdp);
+}
+
 static void sd_rescan(struct device *dev)
 {
 	struct scsi_disk *sdkp = dev_get_drvdata(dev);
@@ -1503,52 +1561,17 @@ static void scsi_disk_release(struct kre
 static void sd_shutdown(struct device *dev)
 {
 	struct scsi_device *sdp = to_scsi_device(dev);
-	struct scsi_disk *sdkp;
-	struct scsi_request *sreq;
-	int retries, res;
+	struct scsi_disk *sdkp = dev_get_drvdata(dev);
 
-	sdkp = dev_get_drvdata(dev);
 	if (!sdkp)
-		return;         /* this can happen */
+		return;		/* this can happen */
 
-	if (!scsi_device_online(sdp) || !sdkp->WCE)
+	if (!sdkp->WCE)
 		return;
 
-	printk(KERN_NOTICE "Synchronizing SCSI cache for disk %s: ",
+	printk(KERN_NOTICE "Synchronizing SCSI cache for disk %s: \n",
			sdkp->disk->disk_name);
-
-	sreq = scsi_allocate_request(sdp, GFP_KERNEL);
-	if (!sreq) {
-		printk("FAILED\n No memory for request\n");
-		return;
-	}
-
-	sreq->sr_data_direction = DMA_NONE;
-	for (retries = 3; retries > 0; --retries) {
-		unsigned char cmd[10] = { 0 };
-
-		cmd[0] = SYNCHRONIZE_CACHE;
-		/*
-		 * Leave the rest of the command zero to indicate
-		 * flush everything.
-		 */
-		scsi_wait_req(sreq, cmd, NULL, 0, SD_TIMEOUT, SD_MAX_RETRIES);
-		if (sreq->sr_result == 0)
-			break;
-	}
-
-	res = sreq->sr_result;
-	if (res) {
-		printk(KERN_WARNING "FAILED\n  status = %x, message = %02x, "
-				    "host = %d, driver = %02x\n  ",
-				    status_byte(res), msg_byte(res),
-				    host_byte(res), driver_byte(res));
-		if (driver_byte(res) & DRIVER_SENSE)
-			scsi_print_req_sense("sd", sreq);
-	}
-
-	scsi_release_request(sreq);
-	printk("\n");
+	sd_sync_cache(sdp);
 }
 
 /**
Index: linux-2.6.8.1-ck9/fs/buffer.c
===================================================================
--- linux-2.6.8.1-ck9.orig/fs/buffer.c	2004-08-15 14:08:13.000000000 +1000
+++ linux-2.6.8.1-ck9/fs/buffer.c	2004-10-02 11:38:03.032995010 +1000
@@ -213,7 +213,7 @@ void end_buffer_write_sync(struct buffer
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (printk_ratelimit()) {
+		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
 			buffer_io_error(bh);
 			printk(KERN_WARNING "lost page write due to "
					"I/O error on %s\n",
@@ -2756,21 +2756,33 @@ static int end_bio_bh_io_sync(struct bio
 	if (bio->bi_size)
 		return 1;
 
+	if (err == -EOPNOTSUPP) {
+		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+		set_bit(BH_Eopnotsupp, &bh->b_state);
+	}
+
 	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
 	bio_put(bio);
 	return 0;
 }
 
-void submit_bh(int rw, struct buffer_head * bh)
+int submit_bh(int rw, struct buffer_head * bh)
 {
 	struct bio *bio;
+	int ret = 0;
 
 	BUG_ON(!buffer_locked(bh));
 	BUG_ON(!buffer_mapped(bh));
 	BUG_ON(!bh->b_end_io);
 
-	/* Only clear out a write error when rewriting */
-	if (test_set_buffer_req(bh) && rw == WRITE)
+	if (buffer_ordered(bh) && (rw == WRITE))
+		rw = WRITE_BARRIER;
+
+	/*
+	 * Only clear out a write error when rewriting; should this
+	 * also include WRITE_SYNC?
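+	 * (note: the buffer_ordered() test above is what upgrades a
+	 * plain WRITE to WRITE_BARRIER, e.g. for the JBD commit block)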
+ */ + if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER)) clear_buffer_write_io_error(bh); /* @@ -2792,7 +2804,14 @@ void submit_bh(int rw, struct buffer_hea bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; + bio_get(bio); submit_bio(rw, bio); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + + bio_put(bio); + return ret; } /** @@ -2851,20 +2870,30 @@ void ll_rw_block(int rw, int nr, struct /* * For a data-integrity writeout, we need to wait upon any in-progress I/O - * and then start new I/O and then wait upon it. + * and then start new I/O and then wait upon it. The caller must have a ref on + * the buffer_head. */ -void sync_dirty_buffer(struct buffer_head *bh) +int sync_dirty_buffer(struct buffer_head *bh) { + int ret = 0; + WARN_ON(atomic_read(&bh->b_count) < 1); lock_buffer(bh); if (test_clear_buffer_dirty(bh)) { get_bh(bh); bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, bh); + ret = submit_bh(WRITE, bh); wait_on_buffer(bh); + if (buffer_eopnotsupp(bh)) { + clear_buffer_eopnotsupp(bh); + ret = -EOPNOTSUPP; + } + if (!ret && !buffer_uptodate(bh)) + ret = -EIO; } else { unlock_buffer(bh); } + return ret; } /* Index: linux-2.6.8.1-ck9/fs/ext3/super.c =================================================================== --- linux-2.6.8.1-ck9.orig/fs/ext3/super.c 2004-08-15 14:08:15.000000000 +1000 +++ linux-2.6.8.1-ck9/fs/ext3/super.c 2004-10-02 11:38:03.040993760 +1000 @@ -587,7 +587,7 @@ enum { Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, - Opt_ignore, Opt_err, + Opt_ignore, Opt_barrier, Opt_err, }; static match_table_t tokens = { @@ -632,6 +632,7 @@ static match_table_t tokens = { {Opt_ignore, "noquota"}, {Opt_ignore, "quota"}, {Opt_ignore, "usrquota"}, + {Opt_barrier, "barrier=%u"}, {Opt_err, NULL} }; @@ -897,6 +898,14 @@ clear_qf_name: case Opt_abort: set_opt(sbi->s_mount_opt, ABORT); break; + case Opt_barrier: + if (match_int(&args[0], &option)) + return 0; + if (option) + set_opt(sbi->s_mount_opt, BARRIER); + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; case Opt_ignore: break; default: @@ -1599,16 +1608,23 @@ out_fail: * initial mount, once the journal has been initialised but before we've * done any recovery; and again on any subsequent remount. */ -static void ext3_init_journal_params(struct ext3_sb_info *sbi, - journal_t *journal) +static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) { + struct ext3_sb_info *sbi = EXT3_SB(sb); + if (sbi->s_commit_interval) journal->j_commit_interval = sbi->s_commit_interval; /* We could also set up an ext3-specific default for the commit * interval here, but for now we'll just fall back to the jbd * default. 
*/ -} + spin_lock(&journal->j_state_lock); + if (test_opt(sb, BARRIER)) + journal->j_flags |= JFS_BARRIER; + else + journal->j_flags &= ~JFS_BARRIER; + spin_unlock(&journal->j_state_lock); +} static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum) { @@ -1646,7 +1662,7 @@ static journal_t *ext3_get_journal(struc return NULL; } journal->j_private = sb; - ext3_init_journal_params(EXT3_SB(sb), journal); + ext3_init_journal_params(sb, journal); return journal; } @@ -1731,7 +1747,7 @@ static journal_t *ext3_get_dev_journal(s goto out_journal; } EXT3_SB(sb)->journal_bdev = bdev; - ext3_init_journal_params(EXT3_SB(sb), journal); + ext3_init_journal_params(sb, journal); return journal; out_journal: journal_destroy(journal); @@ -2028,7 +2044,7 @@ int ext3_remount (struct super_block * s es = sbi->s_es; - ext3_init_journal_params(sbi, sbi->s_journal); + ext3_init_journal_params(sb, sbi->s_journal); if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) Index: linux-2.6.8.1-ck9/fs/jbd/commit.c =================================================================== --- linux-2.6.8.1-ck9.orig/fs/jbd/commit.c 2004-10-02 11:37:43.120106887 +1000 +++ linux-2.6.8.1-ck9/fs/jbd/commit.c 2004-10-02 11:38:03.041993604 +1000 @@ -645,10 +645,38 @@ wait_for_iobuf: JBUFFER_TRACE(descriptor, "write commit block"); { struct buffer_head *bh = jh2bh(descriptor); + int ret; + int barrier_done = 0; set_buffer_dirty(bh); - sync_dirty_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) + if (journal->j_flags & JFS_BARRIER) { + set_buffer_ordered(bh); + barrier_done = 1; + } + ret = sync_dirty_buffer(bh); + /* is it possible for another commit to fail at roughly + * the same time as this one? If so, we don't want to + * trust the barrier flag in the super, but instead want + * to remember if we sent a barrier request + */ + if (ret == -EOPNOTSUPP && barrier_done) { + char b[BDEVNAME_SIZE]; + + printk(KERN_WARNING + "JBD: barrier-based sync failed on %s - " + "disabling barriers\n", + bdevname(journal->j_dev, b)); + spin_lock(&journal->j_state_lock); + journal->j_flags &= ~JFS_BARRIER; + spin_unlock(&journal->j_state_lock); + + /* And try again, without the barrier */ + clear_buffer_ordered(bh); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + ret = sync_dirty_buffer(bh); + } + if (unlikely(ret == -EIO)) err = -EIO; put_bh(bh); /* One for getblk() */ journal_put_journal_head(descriptor); Index: linux-2.6.8.1-ck9/fs/reiserfs/file.c =================================================================== --- linux-2.6.8.1-ck9.orig/fs/reiserfs/file.c 2004-08-15 14:08:17.000000000 +1000 +++ linux-2.6.8.1-ck9/fs/reiserfs/file.c 2004-10-02 11:38:03.056991260 +1000 @@ -89,15 +89,16 @@ static int reiserfs_sync_file( ) { struct inode * p_s_inode = p_s_dentry->d_inode; int n_err; - - reiserfs_write_lock(p_s_inode->i_sb); + int barrier_done; if (!S_ISREG(p_s_inode->i_mode)) BUG (); - n_err = sync_mapping_buffers(p_s_inode->i_mapping) ; - reiserfs_commit_for_inode(p_s_inode) ; + reiserfs_write_lock(p_s_inode->i_sb); + barrier_done = reiserfs_commit_for_inode(p_s_inode); reiserfs_write_unlock(p_s_inode->i_sb); + if (barrier_done != 1) + blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL); return ( n_err < 0 ) ? 
-EIO : 0; } Index: linux-2.6.8.1-ck9/fs/reiserfs/journal.c =================================================================== --- linux-2.6.8.1-ck9.orig/fs/reiserfs/journal.c 2004-08-15 14:08:17.000000000 +1000 +++ linux-2.6.8.1-ck9/fs/reiserfs/journal.c 2004-10-02 11:38:03.071988917 +1000 @@ -127,6 +127,12 @@ static int reiserfs_clean_and_file_buffe return 0 ; } +static void disable_barrier(struct super_block *s) +{ + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH); + printk("reiserfs: disabling flush barriers on %s\n", reiserfs_bdevname(s)); +} + static struct reiserfs_bitmap_node * allocate_bitmap_node(struct super_block *p_s_sb) { struct reiserfs_bitmap_node *bn ; @@ -640,6 +646,26 @@ static void submit_ordered_buffer(struct submit_bh(WRITE, bh) ; } +static int submit_barrier_buffer(struct buffer_head *bh) { + get_bh(bh) ; + bh->b_end_io = reiserfs_end_ordered_io; + clear_buffer_dirty(bh) ; + if (!buffer_uptodate(bh)) + BUG(); + return submit_bh(WRITE_BARRIER, bh) ; +} + +static void check_barrier_completion(struct super_block *s, + struct buffer_head *bh) { + if (buffer_eopnotsupp(bh)) { + clear_buffer_eopnotsupp(bh); + disable_barrier(s); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + } +} + #define CHUNK_SIZE 32 struct buffer_chunk { struct buffer_head *bh[CHUNK_SIZE]; @@ -909,6 +935,7 @@ static int flush_commit_list(struct supe int bn ; struct buffer_head *tbh = NULL ; unsigned long trans_id = jl->j_trans_id; + int barrier = 0; reiserfs_check_lock_depth(s, "flush_commit_list") ; @@ -973,7 +1000,20 @@ static int flush_commit_list(struct supe } atomic_dec(&SB_JOURNAL(s)->j_async_throttle); - /* wait on everything written so far before writing the commit */ + /* wait on everything written so far before writing the commit + * if we are in barrier mode, send the commit down now + */ + barrier = reiserfs_barrier_flush(s); + if (barrier) { + int ret; + lock_buffer(jl->j_commit_bh); + ret = submit_barrier_buffer(jl->j_commit_bh); + if (ret == -EOPNOTSUPP) { + set_buffer_uptodate(jl->j_commit_bh); + disable_barrier(s); + barrier = 0; + } + } for (i = 0 ; i < (jl->j_len + 1) ; i++) { bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ; @@ -995,10 +1035,15 @@ static int flush_commit_list(struct supe if (atomic_read(&(jl->j_commit_left)) != 1) BUG(); - if (buffer_dirty(jl->j_commit_bh)) - BUG(); - mark_buffer_dirty(jl->j_commit_bh) ; - sync_dirty_buffer(jl->j_commit_bh) ; + if (!barrier) { + if (buffer_dirty(jl->j_commit_bh)) + BUG(); + mark_buffer_dirty(jl->j_commit_bh) ; + sync_dirty_buffer(jl->j_commit_bh) ; + } else + wait_on_buffer(jl->j_commit_bh); + + check_barrier_completion(s, jl->j_commit_bh); if (!buffer_uptodate(jl->j_commit_bh)) { reiserfs_panic(s, "journal-615: buffer write failed\n") ; } @@ -1098,8 +1143,23 @@ static int _update_journal_header_block( jh->j_last_flush_trans_id = cpu_to_le32(trans_id) ; jh->j_first_unflushed_offset = cpu_to_le32(offset) ; jh->j_mount_id = cpu_to_le32(SB_JOURNAL(p_s_sb)->j_mount_id) ; - set_buffer_dirty(SB_JOURNAL(p_s_sb)->j_header_bh) ; - sync_dirty_buffer(SB_JOURNAL(p_s_sb)->j_header_bh) ; + + if (reiserfs_barrier_flush(p_s_sb)) { + int ret; + lock_buffer(SB_JOURNAL(p_s_sb)->j_header_bh); + ret = submit_barrier_buffer(SB_JOURNAL(p_s_sb)->j_header_bh); + if (ret == -EOPNOTSUPP) { + set_buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh); + disable_barrier(p_s_sb); + goto sync; + } + wait_on_buffer(SB_JOURNAL(p_s_sb)->j_header_bh); + check_barrier_completion(p_s_sb, 
SB_JOURNAL(p_s_sb)->j_header_bh); + } else { +sync: + set_buffer_dirty(SB_JOURNAL(p_s_sb)->j_header_bh) ; + sync_dirty_buffer(SB_JOURNAL(p_s_sb)->j_header_bh) ; + } if (!buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh)) { reiserfs_warning (p_s_sb, "journal-837: IO error during journal replay"); return -EIO ; @@ -3184,11 +3244,16 @@ void reiserfs_update_inode_transaction(s REISERFS_I(inode)->i_trans_id = SB_JOURNAL(inode->i_sb)->j_trans_id ; } -static void __commit_trans_jl(struct inode *inode, unsigned long id, +/* + * returns -1 on error, 0 if no commits/barriers were done and 1 + * if a transaction was actually committed and the barrier was done + */ +static int __commit_trans_jl(struct inode *inode, unsigned long id, struct reiserfs_journal_list *jl) { struct reiserfs_transaction_handle th ; struct super_block *sb = inode->i_sb ; + int ret = 0; /* is it from the current transaction, or from an unknown transaction? */ if (id == SB_JOURNAL(sb)->j_trans_id) { @@ -3210,6 +3275,7 @@ static void __commit_trans_jl(struct ino } journal_end_sync(&th, sb, 1) ; + ret = 1; } else { /* this gets tricky, we have to make sure the journal list in @@ -3218,13 +3284,21 @@ static void __commit_trans_jl(struct ino */ flush_commit_only: if (journal_list_still_alive(inode->i_sb, id)) { + /* + * we only set ret to 1 when we know for sure + * the barrier hasn't been started yet on the commit + * block. + */ + if (atomic_read(&jl->j_commit_left) > 1) + ret = 1; flush_commit_list(sb, jl, 1) ; } } /* otherwise the list is gone, and long since committed */ + return ret; } -void reiserfs_commit_for_inode(struct inode *inode) { +int reiserfs_commit_for_inode(struct inode *inode) { unsigned long id = REISERFS_I(inode)->i_trans_id; struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; @@ -3237,7 +3311,7 @@ void reiserfs_commit_for_inode(struct in /* jl will be updated in __commit_trans_jl */ } - __commit_trans_jl(inode, id, jl); + return __commit_trans_jl(inode, id, jl); } void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, Index: linux-2.6.8.1-ck9/fs/reiserfs/super.c =================================================================== --- linux-2.6.8.1-ck9.orig/fs/reiserfs/super.c 2004-08-15 14:08:17.000000000 +1000 +++ linux-2.6.8.1-ck9/fs/reiserfs/super.c 2004-10-02 11:38:03.074988448 +1000 @@ -549,6 +549,13 @@ static const arg_desc_t logging_mode[] = {NULL, 0} }; +/* possible values for -o barrier= */ +static const arg_desc_t barrier_mode[] = { + {"none", 1<s_mount_opt &= ~all_barrier; + if (bits & flush) { + REISERFS_SB(s)->s_mount_opt |= flush; + printk("reiserfs: enabling write barrier flush mode\n"); + } else if (bits & none) { + REISERFS_SB(s)->s_mount_opt |= none; + printk("reiserfs: write barriers turned off\n"); + } + } +} + static void handle_attrs( struct super_block *s ) { struct reiserfs_super_block * rs; @@ -854,6 +879,8 @@ static int reiserfs_remount (struct supe safe_mask |= 1 << REISERFS_ATTRS; safe_mask |= 1 << REISERFS_XATTRS_USER; safe_mask |= 1 << REISERFS_POSIXACL; + safe_mask |= 1 << REISERFS_BARRIER_FLUSH; + safe_mask |= 1 << REISERFS_BARRIER_NONE; /* Update the bitmask, taking care to keep * the bits we're not allowed to change here */ @@ -900,6 +927,7 @@ static int reiserfs_remount (struct supe } handle_data_mode(s, mount_options); + handle_barrier_mode(s, mount_options); REISERFS_SB(s)->s_mount_state = sb_umount_state(rs) ; s->s_flags &= ~MS_RDONLY ; /* now it is safe to call journal_begin */ journal_begin(&th, s, 10) ; @@ -1413,6 +1441,9 @@ static int 
reiserfs_fill_super (struct s } else { reiserfs_info (s, "using writeback data mode\n"); } + if (reiserfs_barrier_flush(s)) { + printk("reiserfs: using flush barriers\n"); + } // set_device_ro(s->s_dev, 1) ; if( journal_init(s, jdev_name, old_format, commit_max_age) ) { Index: linux-2.6.8.1-ck9/include/linux/bio.h =================================================================== --- linux-2.6.8.1-ck9.orig/include/linux/bio.h 2004-08-15 14:08:18.000000000 +1000 +++ linux-2.6.8.1-ck9/include/linux/bio.h 2004-10-02 11:38:03.074988448 +1000 @@ -121,6 +121,7 @@ struct bio { #define BIO_CLONED 4 /* doesn't own data */ #define BIO_BOUNCED 5 /* bio is a bounce bio */ #define BIO_USER_MAPPED 6 /* contains user pages */ +#define BIO_EOPNOTSUPP 7 /* not supported */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) /* @@ -160,6 +161,8 @@ struct bio { #define bio_data(bio) (page_address(bio_page((bio))) + bio_offset((bio))) #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) #define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC)) +#define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST)) +#define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) /* * will die Index: linux-2.6.8.1-ck9/include/linux/blkdev.h =================================================================== --- linux-2.6.8.1-ck9.orig/include/linux/blkdev.h 2004-08-15 14:08:18.000000000 +1000 +++ linux-2.6.8.1-ck9/include/linux/blkdev.h 2004-10-02 11:38:03.075988292 +1000 @@ -195,6 +195,8 @@ enum rq_flag_bits { __REQ_PM_SUSPEND, /* suspend request */ __REQ_PM_RESUME, /* resume request */ __REQ_PM_SHUTDOWN, /* shutdown request */ + __REQ_BAR_PREFLUSH, /* barrier pre-flush done */ + __REQ_BAR_POSTFLUSH, /* barrier post-flush */ __REQ_NR_BITS, /* stops here */ }; @@ -220,6 +222,8 @@ enum rq_flag_bits { #define REQ_PM_SUSPEND (1 << __REQ_PM_SUSPEND) #define REQ_PM_RESUME (1 << __REQ_PM_RESUME) #define REQ_PM_SHUTDOWN (1 << __REQ_PM_SHUTDOWN) +#define REQ_BAR_PREFLUSH (1 << __REQ_BAR_PREFLUSH) +#define REQ_BAR_POSTFLUSH (1 << __REQ_BAR_POSTFLUSH) /* * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME @@ -248,6 +252,7 @@ typedef void (unplug_fn) (request_queue_ struct bio_vec; typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *); typedef void (activity_fn) (void *data, int rw); +typedef int (issue_flush_fn) (request_queue_t *, struct gendisk *, sector_t *); enum blk_queue_state { Queue_down, @@ -290,6 +295,7 @@ struct request_queue unplug_fn *unplug_fn; merge_bvec_fn *merge_bvec_fn; activity_fn *activity_fn; + issue_flush_fn *issue_flush_fn; /* * Auto-unplugging state @@ -373,6 +379,7 @@ struct request_queue #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ #define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */ #define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */ +#define QUEUE_FLAG_ORDERED 8 /* supports ordered writes */ #define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) @@ -390,6 +397,10 @@ struct request_queue #define blk_pm_request(rq) \ ((rq)->flags & (REQ_PM_SUSPEND | REQ_PM_RESUME)) +#define blk_barrier_rq(rq) ((rq)->flags & REQ_HARDBARRIER) +#define blk_barrier_preflush(rq) ((rq)->flags & REQ_BAR_PREFLUSH) +#define blk_barrier_postflush(rq) ((rq)->flags & REQ_BAR_POSTFLUSH) + #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) #define rq_data_dir(rq) ((rq)->flags & 1) @@ -560,6 +571,14 @@ extern void end_that_request_last(struct 
Index: linux-2.6.8.1-ck9/include/linux/blkdev.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/blkdev.h	2004-08-15 14:08:18.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/blkdev.h	2004-10-02 11:38:03.075988292 +1000
@@ -195,6 +195,8 @@ enum rq_flag_bits {
 	__REQ_PM_SUSPEND,	/* suspend request */
 	__REQ_PM_RESUME,	/* resume request */
 	__REQ_PM_SHUTDOWN,	/* shutdown request */
+	__REQ_BAR_PREFLUSH,	/* barrier pre-flush done */
+	__REQ_BAR_POSTFLUSH,	/* barrier post-flush */
 	__REQ_NR_BITS,		/* stops here */
 };

@@ -220,6 +222,8 @@ enum rq_flag_bits {
 #define REQ_PM_SUSPEND	(1 << __REQ_PM_SUSPEND)
 #define REQ_PM_RESUME	(1 << __REQ_PM_RESUME)
 #define REQ_PM_SHUTDOWN	(1 << __REQ_PM_SHUTDOWN)
+#define REQ_BAR_PREFLUSH	(1 << __REQ_BAR_PREFLUSH)
+#define REQ_BAR_POSTFLUSH	(1 << __REQ_BAR_POSTFLUSH)

 /*
  * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME
@@ -248,6 +252,7 @@ typedef void (unplug_fn) (request_queue_
 struct bio_vec;
 typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *);
 typedef void (activity_fn) (void *data, int rw);
+typedef int (issue_flush_fn) (request_queue_t *, struct gendisk *, sector_t *);

 enum blk_queue_state {
 	Queue_down,
@@ -290,6 +295,7 @@ struct request_queue
 	unplug_fn		*unplug_fn;
 	merge_bvec_fn		*merge_bvec_fn;
 	activity_fn		*activity_fn;
+	issue_flush_fn		*issue_flush_fn;

 	/*
 	 * Auto-unplugging state
@@ -373,6 +379,7 @@ struct request_queue
 #define QUEUE_FLAG_DEAD		5	/* queue being torn down */
 #define QUEUE_FLAG_REENTER	6	/* Re-entrancy avoidance */
 #define QUEUE_FLAG_PLUGGED	7	/* queue is plugged */
+#define QUEUE_FLAG_ORDERED	8	/* supports ordered writes */

 #define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
@@ -390,6 +397,10 @@ struct request_queue
 #define blk_pm_request(rq)	\
 	((rq)->flags & (REQ_PM_SUSPEND | REQ_PM_RESUME))

+#define blk_barrier_rq(rq)	((rq)->flags & REQ_HARDBARRIER)
+#define blk_barrier_preflush(rq)	((rq)->flags & REQ_BAR_PREFLUSH)
+#define blk_barrier_postflush(rq)	((rq)->flags & REQ_BAR_POSTFLUSH)
+
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)

 #define rq_data_dir(rq)		((rq)->flags & 1)
@@ -560,6 +571,14 @@ extern void end_that_request_last(struct
 extern int process_that_request_first(struct request *, unsigned int);
 extern void end_request(struct request *req, int uptodate);

+/*
+ * end_that_request_first/chunk() takes an uptodate argument. we account
+ * any value <= 0 as an io error. 0 means -EIO for compatibility reasons,
+ * any other < 0 value is the direct error type. An uptodate value of
+ * 1 indicates successful io completion
+ */
+#define end_io_error(uptodate)	(unlikely((uptodate) <= 0))
+
 static inline void blkdev_dequeue_request(struct request *req)
 {
 	BUG_ON(list_empty(&req->queuelist));
@@ -588,6 +607,9 @@ extern void blk_queue_prep_rq(request_qu
 extern void blk_queue_merge_bvec(request_queue_t *, merge_bvec_fn *);
 extern void blk_queue_dma_alignment(request_queue_t *, int);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
+extern void blk_queue_ordered(request_queue_t *, int);
+extern void blk_queue_issue_flush_fn(request_queue_t *, issue_flush_fn *);
+extern int blkdev_scsi_issue_flush_fn(request_queue_t *, struct gendisk *, sector_t *);
 extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
@@ -616,6 +638,7 @@ extern long blk_congestion_wait(int rw,
 extern void blk_rq_bio_prep(request_queue_t *, struct request *, struct bio *);
 extern void blk_rq_prep_restart(struct request *);

+extern int blkdev_issue_flush(struct block_device *, sector_t *);

 #define MAX_PHYS_SEGMENTS 128
 #define MAX_HW_SEGMENTS 128
Index: linux-2.6.8.1-ck9/include/linux/buffer_head.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/buffer_head.h	2004-06-16 17:35:45.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/buffer_head.h	2004-10-02 11:38:03.076988135 +1000
@@ -26,6 +26,8 @@ enum bh_state_bits {
 	BH_Delay,	/* Buffer is not yet allocated on disk */
 	BH_Boundary,	/* Block is followed by a discontiguity */
 	BH_Write_EIO,	/* I/O error on write */
+	BH_Ordered,	/* ordered write */
+	BH_Eopnotsupp,	/* operation not supported (barrier) */

 	BH_PrivateStart,/* not a state bit, but the first bit available
 			 * for private allocation by other entities
@@ -110,7 +112,9 @@ BUFFER_FNS(Async_Read, async_read)
 BUFFER_FNS(Async_Write, async_write)
 BUFFER_FNS(Delay, delay)
 BUFFER_FNS(Boundary, boundary)
-BUFFER_FNS(Write_EIO,write_io_error)
+BUFFER_FNS(Write_EIO, write_io_error)
+BUFFER_FNS(Ordered, ordered)
+BUFFER_FNS(Eopnotsupp, eopnotsupp)

 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
 #define touch_buffer(bh)	mark_page_accessed(bh->b_page)
@@ -172,8 +176,8 @@ void free_buffer_head(struct buffer_head
 void FASTCALL(unlock_buffer(struct buffer_head *bh));
 void FASTCALL(__lock_buffer(struct buffer_head *bh));
 void ll_rw_block(int, int, struct buffer_head * bh[]);
-void sync_dirty_buffer(struct buffer_head *bh);
-void submit_bh(int, struct buffer_head *);
+int sync_dirty_buffer(struct buffer_head *bh);
+int submit_bh(int, struct buffer_head *);
 void write_boundary_block(struct block_device *bdev,
 			sector_t bblock, unsigned blocksize);
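Together with the new int return from submit_bh()/sync_dirty_buffer(), the BH_Ordered/BH_Eopnotsupp bits give journalled filesystems a simple retry convention. A sketch modeled on the journal commit path; write_block_ordered is an invented name, not part of the patch:

#include <linux/buffer_head.h>

/*
 * Write one dirty buffer, optionally as an ordered (barrier) write.
 * If the device turns out not to support barriers, fall back to a
 * plain write so the data still reaches the platter.
 */
static int write_block_ordered(struct buffer_head *bh, int barrier)
{
	int ret;

	if (barrier)
		set_buffer_ordered(bh);	/* BH_Ordered => barrier write */

	ret = sync_dirty_buffer(bh);	/* now returns an error code */

	if (barrier && buffer_eopnotsupp(bh)) {
		/* device rejected the barrier: retry as an ordinary write */
		clear_buffer_eopnotsupp(bh);
		clear_buffer_ordered(bh);
		set_buffer_uptodate(bh);
		set_buffer_dirty(bh);
		ret = sync_dirty_buffer(bh);
	}
	return ret;
}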
Index: linux-2.6.8.1-ck9/include/linux/ext3_fs.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/ext3_fs.h	2004-08-15 14:08:19.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/ext3_fs.h	2004-10-02 11:38:03.076988135 +1000
@@ -324,6 +324,7 @@ struct ext3_inode {
 #define EXT3_MOUNT_NO_UID32		0x2000	/* Disable 32-bit UIDs */
 #define EXT3_MOUNT_XATTR_USER		0x4000	/* Extended user attributes */
 #define EXT3_MOUNT_POSIX_ACL		0x8000	/* POSIX Access Control Lists */
+#define EXT3_MOUNT_BARRIER		0x10000	/* Use block barriers */

 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
Index: linux-2.6.8.1-ck9/include/linux/fs.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/fs.h	2004-08-15 14:08:19.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/fs.h	2004-10-02 11:38:03.077987979 +1000
@@ -88,6 +88,7 @@ extern int leases_enable, dir_notify_ena
 #define SPECIAL 4	/* For non-blockdevice requests in request queue */
 #define READ_SYNC	(READ | (1 << BIO_RW_SYNC))
 #define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNC))
+#define WRITE_BARRIER	((1 << BIO_RW) | (1 << BIO_RW_BARRIER))

 #define SEL_IN		1
 #define SEL_OUT		2
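WRITE_BARRIER is the bio-level counterpart: an ordinary write with BIO_RW_BARRIER set. An illustrative submitter, reusing the barrier_end_io callback sketched after the bio.h hunks; submit_barrier_page and all local names are invented:

#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/completion.h>

/*
 * Synchronously write one page as a barrier request. Returns
 * -EOPNOTSUPP when the device cannot do barriers, so the caller can
 * retry with plain WRITE.
 */
static int submit_barrier_page(struct block_device *bdev, sector_t sector,
			       struct page *page)
{
	DECLARE_COMPLETION(done);
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	int ret = 0;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = barrier_end_io;	/* earlier sketch */
	bio->bi_private = &done;
	bio_add_page(bio, page, PAGE_SIZE, 0);

	submit_bio(WRITE_BARRIER, bio);		/* write + BIO_RW_BARRIER */
	wait_for_completion(&done);

	if (bio_flagged(bio, BIO_EOPNOTSUPP))
		ret = -EOPNOTSUPP;		/* retry without the barrier */
	else if (!bio_flagged(bio, BIO_UPTODATE))
		ret = -EIO;
	bio_put(bio);
	return ret;
}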
Index: linux-2.6.8.1-ck9/include/linux/ide.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/ide.h	2004-08-15 14:08:19.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/ide.h	2004-10-02 11:38:03.078987823 +1000
@@ -780,6 +780,7 @@ typedef struct ide_drive_s {
 	u8	sect;		/* "real" sectors per track */
 	u8	bios_head;	/* BIOS/fdisk/LILO number of heads */
 	u8	bios_sect;	/* BIOS/fdisk/LILO sectors per track */
+	u8	doing_barrier;	/* state, 1=currently doing flush */

 	unsigned int	bios_cyl;	/* BIOS/fdisk/LILO number of cyls */
 	unsigned int	cyl;		/* "real" number of cyls */
@@ -1293,6 +1294,11 @@ extern ide_startstop_t ide_do_reset (ide
 extern void ide_init_drive_cmd (struct request *rq);

 /*
+ * this function returns error location sector offset in case of a write error
+ */
+extern u64 ide_get_error_location(ide_drive_t *, char *);
+
+/*
  * "action" parameter type for ide_do_drive_cmd() below.
  */
 typedef enum {
@@ -1664,4 +1670,11 @@ extern struct semaphore ide_cfg_sem;

 extern struct bus_type ide_bus_type;

+/* check if CACHE FLUSH (EXT) command is supported (bits defined in ATA-6) */
+#define ide_id_has_flush_cache(id)	((id)->cfs_enable_2 & 0x3000)
+
+/* some Maxtor disks have bit 13 defined incorrectly so check bit 10 too */
+#define ide_id_has_flush_cache_ext(id)	\
+	(((id)->cfs_enable_2 & 0x2400) == 0x2400)
+
 #endif /* _IDE_H */
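These id bits are what gates barrier support on IDE: only a drive that implements the ATA-6 cache flush commands can back a barrier with a real flush. A hypothetical probe-time check; example_enable_barriers is not part of the patch, which wires this up in its ide-disk changes:

#include <linux/ide.h>

/*
 * Advertise ordered-write support only when the drive can actually
 * execute FLUSH CACHE. LBA48 drives additionally want the EXT variant,
 * hence the separate ide_id_has_flush_cache_ext() test with the
 * Maxtor bit-13 workaround.
 */
static void example_enable_barriers(ide_drive_t *drive)
{
	struct hd_driveid *id = drive->id;

	if (ide_id_has_flush_cache(id))
		blk_queue_ordered(drive->queue, 1);
}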
Index: linux-2.6.8.1-ck9/include/linux/jbd.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/jbd.h	2004-08-15 14:08:19.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/jbd.h	2004-10-02 11:38:03.079987667 +1000
@@ -840,6 +840,7 @@ struct journal_s
 #define JFS_ACK_ERR	0x004	/* The errno in the sb has been acked */
 #define JFS_FLUSHED	0x008	/* The journal superblock has been flushed */
 #define JFS_LOADED	0x010	/* The journal superblock has been loaded */
+#define JFS_BARRIER	0x020	/* Use IDE barriers */

 /*
  * Function declarations for the journaling transaction and buffer
Index: linux-2.6.8.1-ck9/include/linux/reiserfs_fs.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/reiserfs_fs.h	2004-08-15 14:08:19.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/reiserfs_fs.h	2004-10-02 11:38:03.080987510 +1000
@@ -1777,7 +1777,8 @@ int reiserfs_end_persistent_transaction(
 int reiserfs_commit_page(struct inode *inode, struct page *page, unsigned from, unsigned to);
 int reiserfs_flush_old_commits(struct super_block *);
-void reiserfs_commit_for_inode(struct inode *) ;
+int reiserfs_commit_for_inode(struct inode *) ;
+int reiserfs_inode_needs_commit(struct inode *) ;
 void reiserfs_update_inode_transaction(struct inode *) ;
 void reiserfs_wait_on_write_block(struct super_block *s) ;
 void reiserfs_block_writes(struct reiserfs_transaction_handle *th) ;
Index: linux-2.6.8.1-ck9/include/linux/reiserfs_fs_sb.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/reiserfs_fs_sb.h	2004-06-16 17:35:46.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/reiserfs_fs_sb.h	2004-10-02 11:38:03.081987354 +1000
@@ -444,6 +444,8 @@ enum reiserfs_mount_options {
 	REISERFS_XATTRS,
 	REISERFS_XATTRS_USER,
 	REISERFS_POSIXACL,
+	REISERFS_BARRIER_NONE,
+	REISERFS_BARRIER_FLUSH,

 	REISERFS_TEST1,
 	REISERFS_TEST2,
@@ -473,6 +475,8 @@ enum reiserfs_mount_options {
 #define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER))
 #define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL))
 #define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s))
+#define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE))
+#define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH))

 void reiserfs_file_buffer (struct buffer_head * bh, int list);
 extern struct file_system_type reiserfs_fs_type;
Index: linux-2.6.8.1-ck9/include/scsi/scsi_driver.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/scsi/scsi_driver.h	2004-03-11 21:29:26.000000000 +1100
+++ linux-2.6.8.1-ck9/include/scsi/scsi_driver.h	2004-10-02 11:38:03.082987198 +1000
@@ -13,6 +13,7 @@ struct scsi_driver {

 	int (*init_command)(struct scsi_cmnd *);
 	void (*rescan)(struct device *);
+	int (*issue_flush)(struct device *, sector_t *);
 };

 #define to_scsi_driver(drv) \
 	container_of((drv), struct scsi_driver, gendrv)
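For completeness, a hedged sketch of the driver-side contract implied by the hooks above: a driver whose hardware can flush its write cache advertises ordered-write support and registers a flush callback when it sets up its queue. The mydrv_* names are invented; the flush body is a stub, since the actual command is hardware specific:

#include <linux/blkdev.h>
#include <linux/genhd.h>

/*
 * Queue-level flush hook. On failure the driver may store the sector
 * where the error occurred in *error_sector (if non-NULL), matching
 * the issue_flush_fn signature added to blkdev.h.
 */
static int mydrv_issue_flush(request_queue_t *q, struct gendisk *disk,
			     sector_t *error_sector)
{
	/* send a cache-flush command to the hardware here */
	return 0;
}

static void mydrv_init_queue(request_queue_t *q)
{
	blk_queue_ordered(q, 1);			/* barriers supported */
	blk_queue_issue_flush_fn(q, mydrv_issue_flush);
}

A SCSI upper-level driver would instead fill in the new scsi_driver.issue_flush method; the blkdev_scsi_issue_flush_fn() declaration suggests it exists to route queue-level flush requests to that per-driver method.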