Index: linux-2.6.8.1-ck9/drivers/block/ll_rw_blk.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/block/ll_rw_blk.c	2004-08-15 14:08:05.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/block/ll_rw_blk.c	2004-10-02 11:38:02.986002354 +1000
@@ -263,6 +263,45 @@ void blk_queue_make_request(request_queu
 EXPORT_SYMBOL(blk_queue_make_request);
 
 /**
+ * blk_queue_ordered - set whether this queue supports ordered writes
+ * @q:    the request queue
+ * @flag: enable (non-zero) or disable (zero) ordered writes
+ *
+ * Description:
+ *   For journalled file systems, doing ordered writes on a commit
+ *   block instead of explicitly doing wait_on_buffer (which is bad
+ *   for performance) can be a big win. Block drivers supporting this
+ *   feature should call this function and indicate so.
+ *
+ **/
+void blk_queue_ordered(request_queue_t *q, int flag)
+{
+	if (flag)
+		set_bit(QUEUE_FLAG_ORDERED, &q->queue_flags);
+	else
+		clear_bit(QUEUE_FLAG_ORDERED, &q->queue_flags);
+}
+
+EXPORT_SYMBOL(blk_queue_ordered);
+
+/**
+ * blk_queue_issue_flush_fn - set function for issuing a flush
+ * @q:   the request queue
+ * @iff: the function to be called issuing the flush
+ *
+ * Description:
+ *   If a driver supports issuing a cache flush command, it notifies
+ *   the block layer of that capability through this call.
+ *
+ **/
+void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff)
+{
+	q->issue_flush_fn = iff;
+}
+
+EXPORT_SYMBOL(blk_queue_issue_flush_fn);
+
+/**
  * blk_queue_bounce_limit - set bounce buffer limit for queue
  * @q:  the request queue for the device
  * @dma_addr:   bus address limit
@@ -482,15 +521,14 @@ struct request *blk_queue_find_tag(reque
 EXPORT_SYMBOL(blk_queue_find_tag);
 
 /**
- * blk_queue_free_tags - release tag maintenance info
+ * __blk_queue_free_tags - release tag maintenance info
  * @q:  the request queue for the device
  *
  *  Notes:
  *	blk_cleanup_queue() will take care of calling this function, if tagging
- *	has been used. So there's usually no need to call this directly, unless
- *	tagging is just being disabled but the queue remains in function.
+ *	has been used. So there's no need to call this directly.
 **/
-void blk_queue_free_tags(request_queue_t *q)
+static void __blk_queue_free_tags(request_queue_t *q)
 {
 	struct blk_queue_tag *bqt = q->queue_tags;
 
@@ -514,12 +552,27 @@ void blk_queue_free_tags(request_queue_t
 	q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
 }
 
+/**
+ * blk_queue_free_tags - release tag maintenance info
+ * @q:  the request queue for the device
+ *
+ *  Notes:
+ *	This is used to disable tagged queuing on a device, while leaving
+ *	the queue in function.
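+ *
+ *	(The tag map itself is not freed here; it is released from
+ *	blk_cleanup_queue() via __blk_queue_free_tags() above.)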
+ **/
+void blk_queue_free_tags(request_queue_t *q)
+{
+	clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+}
+
 EXPORT_SYMBOL(blk_queue_free_tags);
 
 static int
 init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
 {
 	int bits, i;
+	struct request **tag_index;
+	unsigned long *tag_map;
 
 	if (depth > q->nr_requests * 2) {
 		depth = q->nr_requests * 2;
@@ -527,32 +580,31 @@ init_tag_map(request_queue_t *q, struct 
 			__FUNCTION__, depth);
 	}
 
-	tags->tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC);
-	if (!tags->tag_index)
+	tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC);
+	if (!tag_index)
 		goto fail;
 
 	bits = (depth / BLK_TAGS_PER_LONG) + 1;
-	tags->tag_map = kmalloc(bits * sizeof(unsigned long), GFP_ATOMIC);
-	if (!tags->tag_map)
+	tag_map = kmalloc(bits * sizeof(unsigned long), GFP_ATOMIC);
+	if (!tag_map)
 		goto fail;
 
-	memset(tags->tag_index, 0, depth * sizeof(struct request *));
-	memset(tags->tag_map, 0, bits * sizeof(unsigned long));
+	memset(tag_index, 0, depth * sizeof(struct request *));
+	memset(tag_map, 0, bits * sizeof(unsigned long));
 	tags->max_depth = depth;
 	tags->real_max_depth = bits * BITS_PER_LONG;
+	tags->tag_index = tag_index;
+	tags->tag_map = tag_map;
 
 	/*
 	 * set the upper bits if the depth isn't a multiple of the word size
 	 */
 	for (i = depth; i < bits * BLK_TAGS_PER_LONG; i++)
-		__set_bit(i, tags->tag_map);
+		__set_bit(i, tag_map);
 
-	INIT_LIST_HEAD(&tags->busy_list);
-	tags->busy = 0;
-	atomic_set(&tags->refcnt, 1);
 	return 0;
 fail:
-	kfree(tags->tag_index);
+	kfree(tag_index);
 	return -ENOMEM;
 }
 
@@ -564,13 +616,26 @@ fail:
 int blk_queue_init_tags(request_queue_t *q, int depth,
 			struct blk_queue_tag *tags)
 {
-	if (!tags) {
+	int rc;
+
+	BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
+
+	if (!tags && !q->queue_tags) {
 		tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
 		if (!tags)
 			goto fail;
 
 		if (init_tag_map(q, tags, depth))
 			goto fail;
+
+		INIT_LIST_HEAD(&tags->busy_list);
+		tags->busy = 0;
+		atomic_set(&tags->refcnt, 1);
+	} else if (q->queue_tags) {
+		if ((rc = blk_queue_resize_tags(q, depth)))
+			return rc;
+		set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+		return 0;
 	} else
 		atomic_inc(&tags->refcnt);
 
@@ -1335,8 +1400,8 @@ void blk_cleanup_queue(request_queue_t *
 	if (rl->rq_pool)
 		mempool_destroy(rl->rq_pool);
 
-	if (blk_queue_tagged(q))
-		blk_queue_free_tags(q);
+	if (q->queue_tags)
+		__blk_queue_free_tags(q);
 
 	kmem_cache_free(requestq_cachep, q);
 }
@@ -1925,10 +1990,11 @@ int blk_execute_rq(request_queue_t *q, s
 	}
 
 	rq->flags |= REQ_NOMERGE;
-	rq->waiting = &wait;
+	if (!rq->waiting)
+		rq->waiting = &wait;
 	elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1);
 	generic_unplug_device(q);
-	wait_for_completion(&wait);
+	wait_for_completion(rq->waiting);
 	rq->waiting = NULL;
 
 	if (rq->errors)
@@ -1939,6 +2005,72 @@ int blk_execute_rq(request_queue_t *q, s
 
 EXPORT_SYMBOL(blk_execute_rq);
 
+/**
+ * blkdev_issue_flush - queue a flush
+ * @bdev:	blockdev to issue flush for
+ * @error_sector: error sector
+ *
+ * Description:
+ *    Issue a flush for the block device in question. Caller can supply
+ *    room for storing the error offset in case of a flush error, if they
+ *    wish to. Caller must run wait_for_completion() on its own.
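+ *
+ *    As wired below, the flush implementations themselves block (both
+ *    the SCSI helper and the IDE variant go through blk_execute_rq()),
+ *    so a minimal caller sketch is simply:
+ *
+ *        sector_t bad_sector;
+ *        int err = blkdev_issue_flush(bdev, &bad_sector);
+ *
+ *    where an -EOPNOTSUPP return only means the queue has no
+ *    issue_flush_fn, which most callers can treat as success.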
+ */
+int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
+{
+	request_queue_t *q;
+
+	if (bdev->bd_disk == NULL)
+		return -ENXIO;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+	if (!q->issue_flush_fn)
+		return -EOPNOTSUPP;
+
+	return q->issue_flush_fn(q, bdev->bd_disk, error_sector);
+}
+
+EXPORT_SYMBOL(blkdev_issue_flush);
+
+/**
+ * blkdev_scsi_issue_flush_fn - issue flush for SCSI devices
+ * @q:		device queue
+ * @disk:	gendisk
+ * @error_sector: error offset
+ *
+ * Description:
+ *    Devices understanding the SCSI command set can use this function as
+ *    a helper for issuing a cache flush. Note: the driver is required to
+ *    store the error offset (in case of a flush error) in ->sector of
+ *    struct request.
+ */
+int blkdev_scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk,
+			       sector_t *error_sector)
+{
+	struct request *rq = blk_get_request(q, WRITE, __GFP_WAIT);
+	int ret;
+
+	rq->flags |= REQ_BLOCK_PC | REQ_SOFTBARRIER;
+	rq->sector = 0;
+	memset(rq->cmd, 0, sizeof(rq->cmd));
+	rq->cmd[0] = 0x35;
+	rq->cmd_len = 12;
+	rq->data = NULL;
+	rq->data_len = 0;
+	rq->timeout = 60 * HZ;
+
+	ret = blk_execute_rq(q, disk, rq);
+
+	if (ret && error_sector)
+		*error_sector = rq->sector;
+
+	blk_put_request(rq);
+	return ret;
+}
+
+EXPORT_SYMBOL(blkdev_scsi_issue_flush_fn);
+
 void drive_stat_acct(struct request *rq, int nr_sectors, int new_io)
 {
 	int rw = rq_data_dir(rq);
@@ -2192,7 +2324,7 @@ EXPORT_SYMBOL(__blk_attempt_remerge);
 static int __make_request(request_queue_t *q, struct bio *bio)
 {
 	struct request *req, *freereq = NULL;
-	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra;
+	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err;
 	sector_t sector;
 
 	sector = bio->bi_sector;
@@ -2210,9 +2342,11 @@ static int __make_request(request_queue_
 
 	spin_lock_prefetch(q->queue_lock);
 
-	barrier = test_bit(BIO_RW_BARRIER, &bio->bi_rw);
-
-	ra = bio->bi_rw & (1 << BIO_RW_AHEAD);
+	barrier = bio_barrier(bio);
+	if (barrier && !(q->queue_flags & (1 << QUEUE_FLAG_ORDERED))) {
+		err = -EOPNOTSUPP;
+		goto end_io;
+	}
 
 again:
 	spin_lock_irq(q->queue_lock);
@@ -2292,7 +2426,8 @@ get_rq:
 		/*
 		 * READA bit set
 		 */
-		if (ra)
+		err = -EWOULDBLOCK;
+		if (bio_rw_ahead(bio))
 			goto end_io;
 
 		freereq = get_request_wait(q, rw);
@@ -2303,10 +2438,9 @@ get_rq:
 	req->flags |= REQ_CMD;
 
 	/*
-	 * inherit FAILFAST from bio and don't stack up
-	 * retries for read ahead
+	 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
 	 */
-	if (ra || test_bit(BIO_RW_FAILFAST, &bio->bi_rw))
+	if (bio_rw_ahead(bio) || bio_failfast(bio))
 		req->flags |= REQ_FAILFAST;
 
 	/*
@@ -2340,7 +2474,7 @@ out:
 	return 0;
 
 end_io:
-	bio_endio(bio, nr_sectors << 9, -EWOULDBLOCK);
+	bio_endio(bio, nr_sectors << 9, err);
 	return 0;
 }
 
@@ -2647,10 +2781,17 @@ void blk_recalc_rq_sectors(struct reques
 static int __end_that_request_first(struct request *req, int uptodate,
 				    int nr_bytes)
 {
-	int total_bytes, bio_nbytes, error = 0, next_idx = 0;
+	int total_bytes, bio_nbytes, error, next_idx = 0;
 	struct bio *bio;
 
 	/*
+	 * extend uptodate bool to allow < 0 value to be direct io error
+	 */
+	error = 0;
+	if (end_io_error(uptodate))
+		error = !uptodate ? -EIO : uptodate;
+
+	/*
 	 * for a REQ_BLOCK_PC request, we want to carry any eventual
 	 * sense key with us all the way through
 	 */
@@ -2658,7 +2799,6 @@ static int __end_that_request_first(stru
 		req->errors = 0;
 
 	if (!uptodate) {
-		error = -EIO;
 		if (blk_fs_request(req) && !(req->flags & REQ_QUIET))
 			printk("end_request: I/O error, dev %s, sector %llu\n",
 				req->rq_disk ? req->rq_disk->disk_name : "?",
@@ -2741,7 +2881,7 @@ static int __end_that_request_first(stru
 /**
  * end_that_request_first - end I/O on a request
  * @req:      the request being processed
- * @uptodate: 0 for I/O error
+ * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
 * @nr_sectors: number of sectors to end I/O on
 *
 * Description:
@@ -2762,7 +2902,7 @@ EXPORT_SYMBOL(end_that_request_first);
 /**
  * end_that_request_chunk - end I/O on a request
  * @req:      the request being processed
- * @uptodate: 0 for I/O error
+ * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
 * @nr_bytes: number of bytes to complete
 *
 * Description:
Index: linux-2.6.8.1-ck9/drivers/ide/ide.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/ide/ide.c	2004-08-15 14:08:06.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/ide/ide.c	2004-10-02 11:38:02.988002042 +1000
@@ -437,6 +437,30 @@ u8 ide_dump_status (ide_drive_t *drive, 
 #endif	/* FANCY_STATUS_DUMPS */
 		printk("\n");
 	}
+	{
+		struct request *rq;
+		int opcode = 0x100;
+
+		spin_lock(&ide_lock);
+		rq = HWGROUP(drive)->rq;
+		spin_unlock(&ide_lock);
+		if (!rq)
+			goto out;
+		if (rq->flags & (REQ_DRIVE_CMD | REQ_DRIVE_TASK)) {
+			char *args = rq->buffer;
+			if (args)
+				opcode = args[0];
+		} else if (rq->flags & REQ_DRIVE_TASKFILE) {
+			ide_task_t *args = rq->special;
+			if (args) {
+				task_struct_t *tf = (task_struct_t *) args->tfRegister;
+				opcode = tf->command;
+			}
+		}
+
+		printk("ide: failed opcode was %x\n", opcode);
+	}
+out:
 	local_irq_restore(flags);
 	return err;
 }
Index: linux-2.6.8.1-ck9/drivers/ide/ide-disk.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/ide/ide-disk.c	2004-08-15 14:08:06.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/ide/ide-disk.c	2004-10-02 11:38:02.989001885 +1000
@@ -702,6 +702,37 @@ static u8 idedisk_dump_status (ide_drive
 	}
 #endif	/* FANCY_STATUS_DUMPS */
 	printk("\n");
+	{
+		struct request *rq;
+		unsigned char opcode = 0;
+		int found = 0;
+
+		spin_lock(&ide_lock);
+		rq = HWGROUP(drive)->rq;
+		spin_unlock(&ide_lock);
+		if (!rq)
+			goto out;
+		if (rq->flags & (REQ_DRIVE_CMD | REQ_DRIVE_TASK)) {
+			char *args = rq->buffer;
+			if (args) {
+				opcode = args[0];
+				found = 1;
+			}
+		} else if (rq->flags & REQ_DRIVE_TASKFILE) {
+			ide_task_t *args = rq->special;
+			if (args) {
+				task_struct_t *tf = (task_struct_t *) args->tfRegister;
+				opcode = tf->command;
+				found = 1;
+			}
+		}
+		printk("ide: failed opcode was: ");
+		if (!found)
+			printk("unknown\n");
+		else
+			printk("0x%02x\n", opcode);
+	}
+out:
 	local_irq_restore(flags);
 	return err;
 }
@@ -1203,6 +1234,42 @@ static ide_proc_entry_t idedisk_proc[] =
 
 #endif	/* CONFIG_PROC_FS */
 
+static int idedisk_issue_flush(request_queue_t *q, struct gendisk *disk,
+			       sector_t *error_sector)
+{
+	ide_drive_t *drive = q->queuedata;
+	struct request *rq;
+	int ret;
+
+	if (!drive->wcache)
+		return 0;
+
+	rq = blk_get_request(q, WRITE, __GFP_WAIT);
+
+	memset(rq->cmd, 0, sizeof(rq->cmd));
+
+	if (ide_id_has_flush_cache_ext(drive->id) &&
+	    (drive->capacity64 >= (1UL << 28)))
+		rq->cmd[0] = WIN_FLUSH_CACHE_EXT;
+	else
+		rq->cmd[0] = WIN_FLUSH_CACHE;
+
+	rq->flags |= REQ_DRIVE_TASK | REQ_SOFTBARRIER;
+	rq->buffer = rq->cmd;
+
+	ret = blk_execute_rq(q, disk, rq);
+
+	/*
+	 * if we failed and caller wants error offset, get it
+	 */
+	if (ret && error_sector)
+		*error_sector = ide_get_error_location(drive, rq->cmd);
+
+	blk_put_request(rq);
+	return ret;
+}
+
 /*
  * This is tightly woven into the driver->do_special can not touch.
  * DON'T do it again until a total personality rewrite is committed.
@@ -1231,16 +1298,10 @@ static int set_nowerr(ide_drive_t *drive
 	return 0;
 }
 
-/* check if CACHE FLUSH (EXT) command is supported (bits defined in ATA-6) */
-#define ide_id_has_flush_cache(id)	((id)->cfs_enable_2 & 0x3000)
-
-/* some Maxtor disks have bit 13 defined incorrectly so check bit 10 too */
-#define ide_id_has_flush_cache_ext(id)	\
-	(((id)->cfs_enable_2 & 0x2400) == 0x2400)
-
 static int write_cache (ide_drive_t *drive, int arg)
 {
 	ide_task_t args;
+	int err;
 
 	if (!ide_id_has_flush_cache(drive->id))
 		return 1;
@@ -1251,7 +1312,10 @@ static int write_cache (ide_drive_t *dri
 	args.tfRegister[IDE_COMMAND_OFFSET]	= WIN_SETFEATURES;
 	args.command_type			= IDE_DRIVE_TASK_NO_DATA;
 	args.handler				= &task_no_data_intr;
-	(void) ide_raw_taskfile(drive, &args, NULL);
+
+	err = ide_raw_taskfile(drive, &args, NULL);
+	if (err)
+		return err;
 
 	drive->wcache = arg;
 	return 0;
@@ -1412,6 +1476,7 @@ static void idedisk_setup (ide_drive_t *
 {
 	struct hd_driveid *id = drive->id;
 	unsigned long long capacity;
+	int barrier;
 
 	idedisk_add_settings(drive);
 
@@ -1543,6 +1608,27 @@ static void idedisk_setup (ide_drive_t *
 		drive->wcache = 1;
 
 	write_cache(drive, 1);
+
+	/*
+	 * decide if we can sanely support flushes and barriers on
+	 * this drive. unfortunately not all drives advertise FLUSH_CACHE
+	 * support even when they support it. So assume FLUSH_CACHE is there
+	 * always. LBA48 drives are newer, so expect it to flag support
+	 * properly. We can safely support FLUSH_CACHE on lba48, if capacity
+	 * doesn't exceed lba28
+	 */
+	barrier = 1;
+	if (drive->addressing == 1) {
+		if (capacity > (1ULL << 28) && !ide_id_has_flush_cache_ext(id))
+			barrier = 0;
+	}
+
+	printk("%s: cache flushes %ssupported\n",
+		drive->name, barrier ? "" : "not ");
+	if (barrier) {
+		blk_queue_ordered(drive->queue, 1);
+		blk_queue_issue_flush_fn(drive->queue, idedisk_issue_flush);
+	}
 }
 
 static void ide_cacheflush_p(ide_drive_t *drive)
Index: linux-2.6.8.1-ck9/drivers/ide/ide-io.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/ide/ide-io.c	2004-06-16 17:35:36.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/ide/ide-io.c	2004-10-02 11:38:02.991001573 +1000
@@ -54,38 +54,77 @@
 #include
 #include
 
-/**
- * ide_end_request	-	complete an IDE I/O
- * @drive: IDE device for the I/O
- * @uptodate:
- * @nr_sectors: number of sectors completed
- *
- * This is our end_request wrapper function. We complete the I/O
- * update random number input and dequeue the request, which if
- * it was tagged may be out of order.
+static void ide_fill_flush_cmd(ide_drive_t *drive, struct request *rq)
+{
+	char *buf = rq->cmd;
+
+	/*
+	 * reuse cdb space for ata command
+	 */
+	memset(buf, 0, sizeof(rq->cmd));
+
+	rq->flags |= REQ_DRIVE_TASK | REQ_STARTED;
+	rq->buffer = buf;
+	rq->buffer[0] = WIN_FLUSH_CACHE;
+
+	if (ide_id_has_flush_cache_ext(drive->id) &&
+	    (drive->capacity64 >= (1UL << 28)))
+		rq->buffer[0] = WIN_FLUSH_CACHE_EXT;
+}
+
+/*
+ * preempt pending requests, and store this cache flush for immediate
+ * execution
  */
-
-int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors)
+static struct request *ide_queue_flush_cmd(ide_drive_t *drive,
+					   struct request *rq, int post)
 {
-	struct request *rq;
-	unsigned long flags;
-	int ret = 1;
+	struct request *flush_rq = &HWGROUP(drive)->wrq;
 
-	spin_lock_irqsave(&ide_lock, flags);
-	rq = HWGROUP(drive)->rq;
+	/*
+	 * write cache disabled, clear the barrier bit and treat it like
+	 * an ordinary write
+	 */
+	if (!drive->wcache) {
+		rq->flags |= REQ_BAR_PREFLUSH;
+		return rq;
+	}
 
-	BUG_ON(!(rq->flags & REQ_STARTED));
+	ide_init_drive_cmd(flush_rq);
+	ide_fill_flush_cmd(drive, flush_rq);
 
-	if (!nr_sectors)
-		nr_sectors = rq->hard_cur_sectors;
+	flush_rq->special = rq;
+	flush_rq->nr_sectors = rq->nr_sectors;
+
+	if (!post) {
+		drive->doing_barrier = 1;
+		flush_rq->flags |= REQ_BAR_PREFLUSH;
+		blkdev_dequeue_request(rq);
+	} else
+		flush_rq->flags |= REQ_BAR_POSTFLUSH;
+
+	__elv_add_request(drive->queue, flush_rq, ELEVATOR_INSERT_FRONT, 0);
+	HWGROUP(drive)->rq = NULL;
+	return flush_rq;
+}
+
+static int __ide_end_request(ide_drive_t *drive, struct request *rq,
+			     int uptodate, int nr_sectors)
+{
+	int ret = 1;
+
+	BUG_ON(!(rq->flags & REQ_STARTED));
 
 	/*
 	 * if failfast is set on a request, override number of sectors and
 	 * complete the whole request right now
 	 */
-	if (blk_noretry_request(rq) && !uptodate)
+	if (blk_noretry_request(rq) && end_io_error(uptodate))
 		nr_sectors = rq->hard_nr_sectors;
 
+	if (!blk_fs_request(rq) && end_io_error(uptodate) && !rq->errors)
+		rq->errors = -EIO;
+
 	/*
 	 * decide whether to reenable DMA -- 3 is a random magic for now,
 	 * if we DMA timeout more than 3 times, just stay in PIO
@@ -97,15 +136,56 @@ int ide_end_request (ide_drive_t *drive,
 	if (!end_that_request_first(rq, uptodate, nr_sectors)) {
 		add_disk_randomness(rq->rq_disk);
+
+		if (blk_rq_tagged(rq))
+			blk_queue_end_tag(drive->queue, rq);
+
 		blkdev_dequeue_request(rq);
 		HWGROUP(drive)->rq = NULL;
 		end_that_request_last(rq);
 		ret = 0;
 	}
-	spin_unlock_irqrestore(&ide_lock, flags);
 	return ret;
 }
 
+/**
+ * ide_end_request - complete an IDE I/O
+ * @drive: IDE device for the I/O
+ * @uptodate:
+ * @nr_sectors: number of sectors completed
+ *
+ * This is our end_request wrapper function. We complete the I/O,
+ * update the random number input and dequeue the request, which, if
+ * it was tagged, may be out of order.
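+ *
+ * For a barrier write on a drive with write-back caching enabled,
+ * completion is deferred: the sectors are accounted against the
+ * post-flush request built by ide_queue_flush_cmd() above, and the
+ * original request is only completed from ide_complete_barrier()
+ * once that FLUSH CACHE command finishes.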
+ */
+
+int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors)
+{
+	struct request *rq;
+	unsigned long flags;
+	int ret = 1;
+
+	spin_lock_irqsave(&ide_lock, flags);
+	rq = HWGROUP(drive)->rq;
+
+	if (!nr_sectors)
+		nr_sectors = rq->hard_cur_sectors;
+
+	if (!blk_barrier_rq(rq) || !drive->wcache)
+		ret = __ide_end_request(drive, rq, uptodate, nr_sectors);
+	else {
+		struct request *flush_rq = &HWGROUP(drive)->wrq;
+
+		flush_rq->nr_sectors -= nr_sectors;
+		if (!flush_rq->nr_sectors) {
+			ide_queue_flush_cmd(drive, rq, 1);
+			ret = 0;
+		}
+	}
+
+	spin_unlock_irqrestore(&ide_lock, flags);
+	return ret;
+}
 EXPORT_SYMBOL(ide_end_request);
 
 /**
@@ -137,6 +217,113 @@ static void ide_complete_pm_request (ide
 	spin_unlock_irqrestore(&ide_lock, flags);
 }
 
+/*
+ * FIXME: probably move this somewhere else, name is bad too :)
+ */
+u64 ide_get_error_location(ide_drive_t *drive, char *args)
+{
+	u32 high, low;
+	u8 hcyl, lcyl, sect;
+	u64 sector;
+
+	high = 0;
+	hcyl = args[5];
+	lcyl = args[4];
+	sect = args[3];
+
+	if (ide_id_has_flush_cache_ext(drive->id)) {
+		low = (hcyl << 16) | (lcyl << 8) | sect;
+		HWIF(drive)->OUTB(drive->ctl|0x80, IDE_CONTROL_REG);
+		high = ide_read_24(drive);
+	} else {
+		u8 cur = HWIF(drive)->INB(IDE_SELECT_REG);
+		if (cur & 0x40)
+			low = (hcyl << 16) | (lcyl << 8) | sect;
+		else {
+			low = hcyl * drive->head * drive->sect;
+			low += lcyl * drive->sect;
+			low += sect - 1;
+		}
+	}
+
+	sector = ((u64) high << 24) | low;
+	return sector;
+}
+EXPORT_SYMBOL(ide_get_error_location);
+
+static void ide_complete_barrier(ide_drive_t *drive, struct request *rq,
+				 int error)
+{
+	struct request *real_rq = rq->special;
+	int good_sectors, bad_sectors;
+	sector_t sector;
+
+	if (!error) {
+		if (blk_barrier_postflush(rq)) {
+			/*
+			 * this completes the barrier write
+			 */
+			__ide_end_request(drive, real_rq, 1, real_rq->hard_nr_sectors);
+			drive->doing_barrier = 0;
+		} else {
+			/*
+			 * just indicate that we did the pre flush
+			 */
+			real_rq->flags |= REQ_BAR_PREFLUSH;
+			elv_requeue_request(drive->queue, real_rq);
+		}
+		/*
+		 * all is fine, return
+		 */
+		return;
+	}
+
+	/*
+	 * we need to end real_rq, but it's not on the queue currently.
+	 * put it back on the queue, so we don't have to special case
+	 * anything else for completing it
+	 */
+	if (!blk_barrier_postflush(rq))
+		elv_requeue_request(drive->queue, real_rq);
+
+	/*
+	 * drive aborted flush command, assume FLUSH_CACHE_* doesn't
+	 * work and disable barrier support
+	 */
+	if (error & ABRT_ERR) {
+		printk(KERN_ERR "%s: barrier support doesn't work\n", drive->name);
+		__ide_end_request(drive, real_rq, -EOPNOTSUPP, real_rq->hard_nr_sectors);
+		blk_queue_ordered(drive->queue, 0);
+		blk_queue_issue_flush_fn(drive->queue, NULL);
+	} else {
+		/*
+		 * find out what part of the request failed
+		 */
+		good_sectors = 0;
+		if (blk_barrier_postflush(rq)) {
+			sector = ide_get_error_location(drive, rq->buffer);
+
+			if ((sector >= real_rq->hard_sector) &&
+			    (sector < real_rq->hard_sector + real_rq->hard_nr_sectors))
+				good_sectors = sector - real_rq->hard_sector;
+		} else
+			sector = real_rq->hard_sector;
+
+		bad_sectors = real_rq->hard_nr_sectors - good_sectors;
+		if (good_sectors)
+			__ide_end_request(drive, real_rq, 1, good_sectors);
+		if (bad_sectors)
+			__ide_end_request(drive, real_rq, 0, bad_sectors);
+
+		printk(KERN_ERR "%s: failed barrier write: "
+				"sector=%Lx(good=%d/bad=%d)\n",
+				drive->name, (unsigned long long)sector,
+				good_sectors, bad_sectors);
+	}
+
+	drive->doing_barrier = 0;
+}
+
 /**
  * ide_end_drive_cmd - end an explicit drive command
  * @drive: command
@@ -226,6 +413,10 @@ void ide_end_drive_cmd (ide_drive_t *dri
 	spin_lock_irqsave(&ide_lock, flags);
 	blkdev_dequeue_request(rq);
+
+	if (blk_barrier_preflush(rq) || blk_barrier_postflush(rq))
+		ide_complete_barrier(drive, rq, err);
+
 	HWGROUP(drive)->rq = NULL;
 	end_that_request_last(rq);
 	spin_unlock_irqrestore(&ide_lock, flags);
@@ -712,6 +903,22 @@ static inline ide_drive_t *choose_drive 
 repeat:
 	best = NULL;
 	drive = hwgroup->drive;
+
+	/*
+	 * drive is doing pre-flush, ordered write, post-flush sequence. even
+	 * though that is 3 requests, it must be seen as a single transaction.
+	 * we must not preempt this drive until that is complete
+	 */
+	if (drive->doing_barrier) {
+		/*
+		 * small race where queue could get replugged during
+		 * the 3-request flush cycle, just yank the plug since
+		 * we want it to finish asap
+		 */
+		blk_remove_plug(drive->queue);
+		return drive;
+	}
+
 	do {
 		if ((!drive->sleep || time_after_eq(jiffies, drive->sleep))
 		    && !elv_queue_empty(drive->queue)) {
@@ -868,6 +1075,13 @@ void ide_do_request (ide_hwgroup_t *hwgr
 		}
 
 		/*
+		 * if rq is a barrier write, issue pre cache flush if not
+		 * already done
+		 */
+		if (blk_barrier_rq(rq) && !blk_barrier_preflush(rq))
+			rq = ide_queue_flush_cmd(drive, rq, 0);
+
+		/*
 		 * Sanity: don't accept a request that isn't a PM request
 		 * if we are currently power managed. This is very important as
 		 * blk_stop_queue() doesn't prevent the elv_next_request()
@@ -917,7 +1131,9 @@ EXPORT_SYMBOL(ide_do_request);
 */
 void do_ide_request(request_queue_t *q)
 {
-	ide_do_request(q->queuedata, IDE_NO_IRQ);
+	ide_drive_t *drive = q->queuedata;
+
+	ide_do_request(HWGROUP(drive), IDE_NO_IRQ);
 }
 
 /*
@@ -1286,6 +1502,7 @@ void ide_init_drive_cmd (struct request 
 {
 	memset(rq, 0, sizeof(*rq));
 	rq->flags = REQ_DRIVE_CMD;
+	rq->ref_count = 1;
 }
 
 EXPORT_SYMBOL(ide_init_drive_cmd);
Index: linux-2.6.8.1-ck9/drivers/ide/ide-probe.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/ide/ide-probe.c	2004-06-16 17:35:36.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/ide/ide-probe.c	2004-10-02 11:38:02.993001260 +1000
@@ -893,7 +893,7 @@ static int ide_init_queue(ide_drive_t *d
 	if (!q)
 		return 1;
 
-	q->queuedata = HWGROUP(drive);
+	q->queuedata = drive;
 	blk_queue_segment_boundary(q, 0xffff);
 
 	if (!hwif->rqsize)
Index: linux-2.6.8.1-ck9/drivers/md/dm.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/dm.c	2004-08-15 14:08:06.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/dm.c	2004-10-02 11:38:02.994001104 +1000
@@ -597,6 +597,21 @@ static int dm_request(request_queue_t *q
 	return 0;
 }
 
+static int dm_flush_all(request_queue_t *q, struct gendisk *disk,
+			sector_t *error_sector)
+{
+	struct mapped_device *md = q->queuedata;
+	struct dm_table *map = dm_get_table(md);
+	int ret = -ENXIO;
+
+	if (map) {
+		ret = dm_table_flush_all(map);
+		dm_table_put(map);
+	}
+
+	return ret;
+}
+
 static void dm_unplug_all(request_queue_t *q)
 {
 	struct mapped_device *md = q->queuedata;
@@ -764,6 +779,7 @@ static struct mapped_device *alloc_dev(u
 	md->queue->backing_dev_info.congested_data = md;
 	blk_queue_make_request(md->queue, dm_request);
 	md->queue->unplug_fn = dm_unplug_all;
+	md->queue->issue_flush_fn = dm_flush_all;
 
 	md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
 				     mempool_free_slab, _io_cache);
Index: linux-2.6.8.1-ck9/drivers/md/dm.h
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/dm.h	2004-06-16 17:35:36.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/dm.h	2004-10-02 11:38:02.994001104 +1000
@@ -113,6 +113,7 @@ void dm_table_suspend_targets(struct dm_
 void dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
 void dm_table_unplug_all(struct dm_table *t);
+int dm_table_flush_all(struct dm_table *t);
 
 /*-----------------------------------------------------------------
 * A registry of target types.
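The walk-the-members pattern that dm_table_flush_all() implements in the next file recurs in every stacking driver this patch touches (linear, raid0, raid1, raid5, raid6, multipath): look up each member device's queue, call its issue_flush_fn if one is set, and propagate the error. A condensed sketch of that contract follows; the driver type and the example_conf, nr_devs and devs names are invented for illustration:

	static int example_issue_flush(request_queue_t *q, struct gendisk *disk,
				       sector_t *error_sector)
	{
		struct example_conf *conf = q->queuedata;	/* hypothetical */
		int i, ret = 0;

		for (i = 0; i < conf->nr_devs; i++) {
			struct block_device *bdev = conf->devs[i];
			request_queue_t *r_queue = bdev_get_queue(bdev);

			/* a member that cannot flush makes the whole set unflushable */
			if (!r_queue->issue_flush_fn)
				return -EOPNOTSUPP;

			ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
						      error_sector);
			if (ret)
				break;
		}
		return ret;
	}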
Index: linux-2.6.8.1-ck9/drivers/md/dm-table.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/dm-table.c	2004-08-15 14:08:06.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/dm-table.c	2004-10-02 11:38:02.995000948 +1000
@@ -900,6 +900,28 @@ void dm_table_unplug_all(struct dm_table
 	}
 }
 
+int dm_table_flush_all(struct dm_table *t)
+{
+	struct list_head *d, *devices = dm_table_get_devices(t);
+	int ret = 0;
+
+	for (d = devices->next; d != devices; d = d->next) {
+		struct dm_dev *dd = list_entry(d, struct dm_dev, list);
+		request_queue_t *q = bdev_get_queue(dd->bdev);
+		int err;
+
+		if (!q->issue_flush_fn)
+			err = -EOPNOTSUPP;
+		else
+			err = q->issue_flush_fn(q, dd->bdev->bd_disk, NULL);
+
+		if (!ret)
+			ret = err;
+	}
+
+	return ret;
+}
+
 EXPORT_SYMBOL(dm_vcalloc);
 EXPORT_SYMBOL(dm_get_device);
 EXPORT_SYMBOL(dm_put_device);
@@ -908,3 +930,4 @@ EXPORT_SYMBOL(dm_table_get_mode);
 EXPORT_SYMBOL(dm_table_put);
 EXPORT_SYMBOL(dm_table_get);
 EXPORT_SYMBOL(dm_table_unplug_all);
+EXPORT_SYMBOL(dm_table_flush_all);
Index: linux-2.6.8.1-ck9/drivers/md/linear.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/linear.c	2004-05-23 12:54:50.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/linear.c	2004-10-02 11:38:02.996000792 +1000
@@ -47,7 +47,6 @@ static inline dev_info_t *which_dev(mdde
 	return hash->dev0;
 }
 
-
 /**
 *	linear_mergeable_bvec -- tell bio layer if a two requests can be merged
 *	@q: request queue
@@ -93,6 +92,27 @@ static void linear_unplug(request_queue_
 	}
 }
 
+static int linear_issue_flush(request_queue_t *q, struct gendisk *disk,
+			      sector_t *error_sector)
+{
+	mddev_t *mddev = q->queuedata;
+	linear_conf_t *conf = mddev_to_conf(mddev);
+	int i, ret = 0;
+
+	for (i=0; i < mddev->raid_disks; i++) {
+		struct block_device *bdev = conf->disks[i].rdev->bdev;
+		request_queue_t *r_queue = bdev_get_queue(bdev);
+
+		if (!r_queue->issue_flush_fn) {
+			ret = -EOPNOTSUPP;
+			break;
+		}
+
+		ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
+		if (ret)
+			break;
+	}
+	return ret;
+}
 
 static int linear_run (mddev_t *mddev)
 {
@@ -200,6 +220,7 @@ static int linear_run (mddev_t *mddev)
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
 	mddev->queue->unplug_fn = linear_unplug;
+	mddev->queue->issue_flush_fn = linear_issue_flush;
 	return 0;
 
 out:
Index: linux-2.6.8.1-ck9/drivers/md/md.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/md.c	2004-06-16 17:35:36.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/md.c	2004-10-02 11:38:02.998000479 +1000
@@ -154,6 +154,39 @@ static spinlock_t all_mddevs_lock = SPIN
 		tmp = tmp->next;})					\
 		)
 
+int md_flush_mddev(mddev_t *mddev, sector_t *error_sector)
+{
+	struct list_head *tmp;
+	mdk_rdev_t *rdev;
+	int ret = 0;
+
+	/*
+	 * this list iteration is done without any locking in md?!
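+	 * (note that the loop below keeps going after a failure: every
+	 * rdev is flushed, and the first non-zero status is returned)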
+	 */
+	ITERATE_RDEV(mddev, rdev, tmp) {
+		request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
+		int err;
+
+		if (!r_queue->issue_flush_fn)
+			err = -EOPNOTSUPP;
+		else
+			err = r_queue->issue_flush_fn(r_queue, rdev->bdev->bd_disk, error_sector);
+
+		if (!ret)
+			ret = err;
+	}
+
+	return ret;
+}
+
+static int md_flush_all(request_queue_t *q, struct gendisk *disk,
+			sector_t *error_sector)
+{
+	mddev_t *mddev = q->queuedata;
+
+	return md_flush_mddev(mddev, error_sector);
+}
+
 static int md_fail_request (request_queue_t *q, struct bio *bio)
 {
 	bio_io_error(bio, bio->bi_size);
@@ -1645,6 +1678,7 @@ static int do_md_run(mddev_t * mddev)
 	 */
 	mddev->queue->queuedata = mddev;
 	mddev->queue->make_request_fn = mddev->pers->make_request;
+	mddev->queue->issue_flush_fn = md_flush_all;
 
 	mddev->changed = 1;
 	return 0;
Index: linux-2.6.8.1-ck9/drivers/md/multipath.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/multipath.c	2004-08-15 14:08:06.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/multipath.c	2004-10-02 11:38:02.999000323 +1000
@@ -120,7 +120,7 @@ int multipath_end_request(struct bio *bi
 
 	if (uptodate)
 		multipath_end_bh_io(mp_bh, uptodate);
-	else if ((bio->bi_rw & (1 << BIO_RW_AHEAD)) == 0) {
+	else if (!bio_rw_ahead(bio)) {
 		/*
 		 * oops, IO error:
 		 */
@@ -217,6 +217,31 @@ static void multipath_status (struct seq
 	seq_printf (seq, "]");
 }
 
+static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk,
+				 sector_t *error_sector)
+{
+	mddev_t *mddev = q->queuedata;
+	multipath_conf_t *conf = mddev_to_conf(mddev);
+	int i, ret = 0;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->multipaths[i].rdev;
+		if (rdev && !rdev->faulty) {
+			struct block_device *bdev = rdev->bdev;
+			request_queue_t *r_queue = bdev_get_queue(bdev);
+
+			if (!r_queue->issue_flush_fn) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
+
+			ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
+			if (ret)
+				break;
+		}
+	}
+	return ret;
+}
 
 /*
 * Careful, this can execute in IRQ contexts as well!
@@ -435,6 +460,8 @@ static int multipath_run (mddev_t *mddev
 
 	mddev->queue->unplug_fn = multipath_unplug;
 
+	mddev->queue->issue_flush_fn = multipath_issue_flush;
+
 	conf->working_disks = 0;
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		disk_idx = rdev->raid_disk;
Index: linux-2.6.8.1-ck9/drivers/md/raid0.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/raid0.c	2004-05-23 12:54:50.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/raid0.c	2004-10-02 11:38:03.000000167 +1000
@@ -40,6 +40,31 @@ static void raid0_unplug(request_queue_t
 	}
 }
 
+static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk,
+			     sector_t *error_sector)
+{
+	mddev_t *mddev = q->queuedata;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	mdk_rdev_t **devlist = conf->strip_zone[0].dev;
+	int i, ret = 0;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		struct block_device *bdev = devlist[i]->bdev;
+		request_queue_t *r_queue = bdev_get_queue(bdev);
+
+		if (!r_queue->issue_flush_fn) {
+			ret = -EOPNOTSUPP;
+			break;
+		}
+
+		ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
 static int create_strip_zones (mddev_t *mddev)
 {
 	int i, c, j;
@@ -219,6 +244,8 @@ static int create_strip_zones (mddev_t *
 
 	mddev->queue->unplug_fn = raid0_unplug;
 
+	mddev->queue->issue_flush_fn = raid0_issue_flush;
+
 	printk("raid0: done.\n");
 	return 0;
 abort:
Index: linux-2.6.8.1-ck9/drivers/md/raid1.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/raid1.c	2004-08-15 14:08:06.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/raid1.c	2004-10-02 11:38:03.001000010 +1000
@@ -481,6 +481,32 @@ static void raid1_unplug(request_queue_t
 	unplug_slaves(q->queuedata);
 }
 
+static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
+			     sector_t *error_sector)
+{
+	mddev_t *mddev = q->queuedata;
+	conf_t *conf = mddev_to_conf(mddev);
+	unsigned long flags;
+	int i, ret = 0;
+
+	spin_lock_irqsave(&conf->device_lock, flags);
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+		if (rdev && !rdev->faulty) {
+			struct block_device *bdev = rdev->bdev;
+			request_queue_t *r_queue = bdev_get_queue(bdev);
+
+			if (r_queue->issue_flush_fn) {
+				ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
+				if (ret)
+					break;
+			}
+		}
+	}
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+	return ret;
+}
+
 /*
 * Throttle resync depth, so that we can both get proper overlapping of
 * requests, but are still able to handle normal requests quickly.
@@ -1168,6 +1194,7 @@ static int run(mddev_t *mddev)
 
 	mddev->queue->unplug_fn = raid1_unplug;
 
+	mddev->queue->issue_flush_fn = raid1_issue_flush;
 
 	ITERATE_RDEV(mddev, rdev, tmp) {
 		disk_idx = rdev->raid_disk;
Index: linux-2.6.8.1-ck9/drivers/md/raid5.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/raid5.c	2004-08-15 14:08:06.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/raid5.c	2004-10-02 11:38:03.001999854 +1000
@@ -1339,6 +1339,39 @@ static void raid5_unplug_device(request_
 	unplug_slaves(mddev);
 }
 
+static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
+			     sector_t *error_sector)
+{
+	mddev_t *mddev = q->queuedata;
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+	int i, ret = 0;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->disks[i].rdev;
+		if (rdev && !rdev->faulty) {
+			struct block_device *bdev = rdev->bdev;
+			request_queue_t *r_queue;
+
+			if (!bdev)
+				continue;
+
+			r_queue = bdev_get_queue(bdev);
+			if (!r_queue)
+				continue;
+
+			if (!r_queue->issue_flush_fn) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
+
+			ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
+			if (ret)
+				break;
+		}
+	}
+	return ret;
+}
+
 static inline void raid5_plug_device(raid5_conf_t *conf)
 {
 	spin_lock_irq(&conf->device_lock);
@@ -1545,6 +1578,7 @@ static int run (mddev_t *mddev)
 	atomic_set(&conf->preread_active_stripes, 0);
 
 	mddev->queue->unplug_fn = raid5_unplug_device;
+	mddev->queue->issue_flush_fn = raid5_issue_flush;
 
 	PRINTK("raid5: run(%s) called.\n", mdname(mddev));
 
Index: linux-2.6.8.1-ck9/drivers/md/raid6main.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/md/raid6main.c	2004-08-15 14:08:06.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/md/raid6main.c	2004-10-02 11:38:03.003999542 +1000
@@ -1501,6 +1501,39 @@ static void raid6_unplug_device(request_
 	unplug_slaves(mddev);
 }
 
+static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk,
+			     sector_t *error_sector)
+{
+	mddev_t *mddev = q->queuedata;
+	raid6_conf_t *conf = mddev_to_conf(mddev);
+	int i, ret = 0;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->disks[i].rdev;
+		if (rdev && !rdev->faulty) {
+			struct block_device *bdev = rdev->bdev;
+			request_queue_t *r_queue;
+
+			if (!bdev)
+				continue;
+
+			r_queue = bdev_get_queue(bdev);
+			if (!r_queue)
+				continue;
+
+			if (!r_queue->issue_flush_fn) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
+
+			ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
+			if (ret)
+				break;
+		}
+	}
+	return ret;
+}
+
 static inline void raid6_plug_device(raid6_conf_t *conf)
 {
 	spin_lock_irq(&conf->device_lock);
@@ -1708,6 +1741,7 @@ static int run (mddev_t *mddev)
 	atomic_set(&conf->preread_active_stripes, 0);
 
 	mddev->queue->unplug_fn = raid6_unplug_device;
+	mddev->queue->issue_flush_fn = raid6_issue_flush;
 
 	PRINTK("raid6: run(%s) called.\n", mdname(mddev));
 
Index: linux-2.6.8.1-ck9/drivers/scsi/scsi_lib.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/scsi/scsi_lib.c	2004-08-15 14:08:08.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/scsi/scsi_lib.c	2004-10-02 11:38:03.014997823 +1000
@@ -954,6 +954,22 @@ static int scsi_init_io(struct scsi_cmnd
 	return BLKPREP_KILL;
 }
 
+static int scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk,
+			       sector_t *error_sector)
+{
+	struct scsi_device *sdev = q->queuedata;
+	struct scsi_driver *drv;
+
+	if (sdev->sdev_state != SDEV_RUNNING)
+		return -ENXIO;
+
+	drv = *(struct scsi_driver **) disk->private_data;
+	if (drv->issue_flush)
+		return drv->issue_flush(&sdev->sdev_gendev, error_sector);
+
+	return -EOPNOTSUPP;
+}
+
 static int scsi_prep_fn(struct request_queue *q, struct request *req)
 {
 	struct scsi_device *sdev = q->queuedata;
@@ -1335,7 +1351,8 @@ struct request_queue *scsi_alloc_queue(s
 	blk_queue_max_sectors(q, shost->max_sectors);
 	blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
 	blk_queue_segment_boundary(q, shost->dma_boundary);
-
+	blk_queue_issue_flush_fn(q, scsi_issue_flush_fn);
+
 	if (!shost->use_clustering)
 		clear_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
 	return q;
Index: linux-2.6.8.1-ck9/drivers/scsi/sd.c
===================================================================
--- linux-2.6.8.1-ck9.orig/drivers/scsi/sd.c	2004-08-15 14:08:08.000000000 +1000
+++ linux-2.6.8.1-ck9/drivers/scsi/sd.c	2004-10-02 11:38:03.016997510 +1000
@@ -113,6 +113,7 @@ static int sd_remove(struct device *);
 static void sd_shutdown(struct device *dev);
 static void sd_rescan(struct device *);
 static int sd_init_command(struct scsi_cmnd *);
+static int sd_issue_flush(struct device *, sector_t *);
 static void sd_read_capacity(struct scsi_disk *sdkp, char *diskname,
		 struct scsi_request *SRpnt, unsigned char *buffer);
 
@@ -126,6 +127,7 @@ static struct scsi_driver sd_template = 
 	},
 	.rescan			= sd_rescan,
 	.init_command		= sd_init_command,
+	.issue_flush		= sd_issue_flush,
 };
 
 /*
 * Device no to disk mapping:
@@ -676,6 +678,62 @@ not_present:
 	return 1;
 }
 
+static int sd_sync_cache(struct scsi_device *sdp)
+{
+	struct scsi_request *sreq;
+	int retries, res;
+
+	if (!scsi_device_online(sdp))
+		return -ENODEV;
+
+	sreq = scsi_allocate_request(sdp, GFP_KERNEL);
+	if (!sreq) {
+		printk("FAILED\n  No memory for request\n");
+		return -ENOMEM;
+	}
+
+	sreq->sr_data_direction = DMA_NONE;
+	for (retries = 3; retries > 0; --retries) {
+		unsigned char cmd[10] = { 0 };
+
+		cmd[0] = SYNCHRONIZE_CACHE;
+		/*
+		 * Leave the rest of the command zero to indicate
+		 * flush everything.
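+		 * (for SYNCHRONIZE CACHE, a zero LBA and a zero
+		 * number-of-blocks field mean "the whole medium" per SBC)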
+		 */
+		scsi_wait_req(sreq, cmd, NULL, 0, SD_TIMEOUT, SD_MAX_RETRIES);
+		if (sreq->sr_result == 0)
+			break;
+	}
+
+	res = sreq->sr_result;
+	if (res) {
+		printk(KERN_WARNING "FAILED\n  status = %x, message = %02x, "
+				    "host = %d, driver = %02x\n  ",
+				    status_byte(res), msg_byte(res),
+				    host_byte(res), driver_byte(res));
+		if (driver_byte(res) & DRIVER_SENSE)
+			scsi_print_req_sense("sd", sreq);
+	}
+
+	scsi_release_request(sreq);
+	return res;
+}
+
+static int sd_issue_flush(struct device *dev, sector_t *error_sector)
+{
+	struct scsi_device *sdp = to_scsi_device(dev);
+	struct scsi_disk *sdkp = dev_get_drvdata(dev);
+
+	if (!sdkp)
+		return -ENODEV;
+
+	if (!sdkp->WCE)
+		return 0;
+
+	return sd_sync_cache(sdp);
+}
+
 static void sd_rescan(struct device *dev)
 {
 	struct scsi_disk *sdkp = dev_get_drvdata(dev);
@@ -1503,52 +1561,17 @@ static void scsi_disk_release(struct kre
 static void sd_shutdown(struct device *dev)
 {
 	struct scsi_device *sdp = to_scsi_device(dev);
-	struct scsi_disk *sdkp;
-	struct scsi_request *sreq;
-	int retries, res;
+	struct scsi_disk *sdkp = dev_get_drvdata(dev);
 
-	sdkp = dev_get_drvdata(dev);
 	if (!sdkp)
-		return;         /* this can happen */
+		return;		/* this can happen */
 
-	if (!scsi_device_online(sdp) || !sdkp->WCE)
+	if (!sdkp->WCE)
 		return;
 
-	printk(KERN_NOTICE "Synchronizing SCSI cache for disk %s: ",
+	printk(KERN_NOTICE "Synchronizing SCSI cache for disk %s: \n",
			sdkp->disk->disk_name);
-
-	sreq = scsi_allocate_request(sdp, GFP_KERNEL);
-	if (!sreq) {
-		printk("FAILED\n No memory for request\n");
-		return;
-	}
-
-	sreq->sr_data_direction = DMA_NONE;
-	for (retries = 3; retries > 0; --retries) {
-		unsigned char cmd[10] = { 0 };
-
-		cmd[0] = SYNCHRONIZE_CACHE;
-		/*
-		 * Leave the rest of the command zero to indicate
-		 * flush everything.
-		 */
-		scsi_wait_req(sreq, cmd, NULL, 0, SD_TIMEOUT, SD_MAX_RETRIES);
-		if (sreq->sr_result == 0)
-			break;
-	}
-
-	res = sreq->sr_result;
-	if (res) {
-		printk(KERN_WARNING "FAILED\n  status = %x, message = %02x, "
-				    "host = %d, driver = %02x\n  ",
-				    status_byte(res), msg_byte(res),
-				    host_byte(res), driver_byte(res));
-		if (driver_byte(res) & DRIVER_SENSE)
-			scsi_print_req_sense("sd", sreq);
-	}
-
-	scsi_release_request(sreq);
-	printk("\n");
+	sd_sync_cache(sdp);
 }
 
 /**
Index: linux-2.6.8.1-ck9/fs/buffer.c
===================================================================
--- linux-2.6.8.1-ck9.orig/fs/buffer.c	2004-08-15 14:08:13.000000000 +1000
+++ linux-2.6.8.1-ck9/fs/buffer.c	2004-10-02 11:38:03.032995010 +1000
@@ -213,7 +213,7 @@ void end_buffer_write_sync(struct buffer
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (printk_ratelimit()) {
+		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
 			buffer_io_error(bh);
 			printk(KERN_WARNING "lost page write due to "
					"I/O error on %s\n",
@@ -2756,21 +2756,33 @@ static int end_bio_bh_io_sync(struct bio
 	if (bio->bi_size)
 		return 1;
 
+	if (err == -EOPNOTSUPP) {
+		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+		set_bit(BH_Eopnotsupp, &bh->b_state);
+	}
+
 	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
 	bio_put(bio);
 	return 0;
 }
 
-void submit_bh(int rw, struct buffer_head * bh)
+int submit_bh(int rw, struct buffer_head * bh)
 {
 	struct bio *bio;
+	int ret = 0;
 
 	BUG_ON(!buffer_locked(bh));
 	BUG_ON(!buffer_mapped(bh));
 	BUG_ON(!bh->b_end_io);
 
-	/* Only clear out a write error when rewriting */
-	if (test_set_buffer_req(bh) && rw == WRITE)
+	if (buffer_ordered(bh) && (rw == WRITE))
+		rw = WRITE_BARRIER;
+
+	/*
+	 * Only clear out a write error when rewriting; should this
+	 * also include WRITE_SYNC?
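+	 * (note: the buffer_ordered() test above is what upgrades a
+	 * plain WRITE to WRITE_BARRIER, e.g. for the JBD commit block)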
+ */ + if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER)) clear_buffer_write_io_error(bh); /* @@ -2792,7 +2804,14 @@ void submit_bh(int rw, struct buffer_hea bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; + bio_get(bio); submit_bio(rw, bio); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + + bio_put(bio); + return ret; } /** @@ -2851,20 +2870,30 @@ void ll_rw_block(int rw, int nr, struct /* * For a data-integrity writeout, we need to wait upon any in-progress I/O - * and then start new I/O and then wait upon it. + * and then start new I/O and then wait upon it. The caller must have a ref on + * the buffer_head. */ -void sync_dirty_buffer(struct buffer_head *bh) +int sync_dirty_buffer(struct buffer_head *bh) { + int ret = 0; + WARN_ON(atomic_read(&bh->b_count) < 1); lock_buffer(bh); if (test_clear_buffer_dirty(bh)) { get_bh(bh); bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, bh); + ret = submit_bh(WRITE, bh); wait_on_buffer(bh); + if (buffer_eopnotsupp(bh)) { + clear_buffer_eopnotsupp(bh); + ret = -EOPNOTSUPP; + } + if (!ret && !buffer_uptodate(bh)) + ret = -EIO; } else { unlock_buffer(bh); } + return ret; } /* Index: linux-2.6.8.1-ck9/fs/ext3/super.c =================================================================== --- linux-2.6.8.1-ck9.orig/fs/ext3/super.c 2004-08-15 14:08:15.000000000 +1000 +++ linux-2.6.8.1-ck9/fs/ext3/super.c 2004-10-02 11:38:03.040993760 +1000 @@ -587,7 +587,7 @@ enum { Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, - Opt_ignore, Opt_err, + Opt_ignore, Opt_barrier, Opt_err, }; static match_table_t tokens = { @@ -632,6 +632,7 @@ static match_table_t tokens = { {Opt_ignore, "noquota"}, {Opt_ignore, "quota"}, {Opt_ignore, "usrquota"}, + {Opt_barrier, "barrier=%u"}, {Opt_err, NULL} }; @@ -897,6 +898,14 @@ clear_qf_name: case Opt_abort: set_opt(sbi->s_mount_opt, ABORT); break; + case Opt_barrier: + if (match_int(&args[0], &option)) + return 0; + if (option) + set_opt(sbi->s_mount_opt, BARRIER); + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; case Opt_ignore: break; default: @@ -1599,16 +1608,23 @@ out_fail: * initial mount, once the journal has been initialised but before we've * done any recovery; and again on any subsequent remount. */ -static void ext3_init_journal_params(struct ext3_sb_info *sbi, - journal_t *journal) +static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) { + struct ext3_sb_info *sbi = EXT3_SB(sb); + if (sbi->s_commit_interval) journal->j_commit_interval = sbi->s_commit_interval; /* We could also set up an ext3-specific default for the commit * interval here, but for now we'll just fall back to the jbd * default. 
*/ -} + spin_lock(&journal->j_state_lock); + if (test_opt(sb, BARRIER)) + journal->j_flags |= JFS_BARRIER; + else + journal->j_flags &= ~JFS_BARRIER; + spin_unlock(&journal->j_state_lock); +} static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum) { @@ -1646,7 +1662,7 @@ static journal_t *ext3_get_journal(struc return NULL; } journal->j_private = sb; - ext3_init_journal_params(EXT3_SB(sb), journal); + ext3_init_journal_params(sb, journal); return journal; } @@ -1731,7 +1747,7 @@ static journal_t *ext3_get_dev_journal(s goto out_journal; } EXT3_SB(sb)->journal_bdev = bdev; - ext3_init_journal_params(EXT3_SB(sb), journal); + ext3_init_journal_params(sb, journal); return journal; out_journal: journal_destroy(journal); @@ -2028,7 +2044,7 @@ int ext3_remount (struct super_block * s es = sbi->s_es; - ext3_init_journal_params(sbi, sbi->s_journal); + ext3_init_journal_params(sb, sbi->s_journal); if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) Index: linux-2.6.8.1-ck9/fs/jbd/commit.c =================================================================== --- linux-2.6.8.1-ck9.orig/fs/jbd/commit.c 2004-10-02 11:37:43.120106887 +1000 +++ linux-2.6.8.1-ck9/fs/jbd/commit.c 2004-10-02 11:38:03.041993604 +1000 @@ -645,10 +645,38 @@ wait_for_iobuf: JBUFFER_TRACE(descriptor, "write commit block"); { struct buffer_head *bh = jh2bh(descriptor); + int ret; + int barrier_done = 0; set_buffer_dirty(bh); - sync_dirty_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) + if (journal->j_flags & JFS_BARRIER) { + set_buffer_ordered(bh); + barrier_done = 1; + } + ret = sync_dirty_buffer(bh); + /* is it possible for another commit to fail at roughly + * the same time as this one? If so, we don't want to + * trust the barrier flag in the super, but instead want + * to remember if we sent a barrier request + */ + if (ret == -EOPNOTSUPP && barrier_done) { + char b[BDEVNAME_SIZE]; + + printk(KERN_WARNING + "JBD: barrier-based sync failed on %s - " + "disabling barriers\n", + bdevname(journal->j_dev, b)); + spin_lock(&journal->j_state_lock); + journal->j_flags &= ~JFS_BARRIER; + spin_unlock(&journal->j_state_lock); + + /* And try again, without the barrier */ + clear_buffer_ordered(bh); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + ret = sync_dirty_buffer(bh); + } + if (unlikely(ret == -EIO)) err = -EIO; put_bh(bh); /* One for getblk() */ journal_put_journal_head(descriptor); Index: linux-2.6.8.1-ck9/fs/reiserfs/file.c =================================================================== --- linux-2.6.8.1-ck9.orig/fs/reiserfs/file.c 2004-08-15 14:08:17.000000000 +1000 +++ linux-2.6.8.1-ck9/fs/reiserfs/file.c 2004-10-02 11:38:03.056991260 +1000 @@ -89,15 +89,16 @@ static int reiserfs_sync_file( ) { struct inode * p_s_inode = p_s_dentry->d_inode; int n_err; - - reiserfs_write_lock(p_s_inode->i_sb); + int barrier_done; if (!S_ISREG(p_s_inode->i_mode)) BUG (); - n_err = sync_mapping_buffers(p_s_inode->i_mapping) ; - reiserfs_commit_for_inode(p_s_inode) ; + reiserfs_write_lock(p_s_inode->i_sb); + barrier_done = reiserfs_commit_for_inode(p_s_inode); reiserfs_write_unlock(p_s_inode->i_sb); + if (barrier_done != 1) + blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL); return ( n_err < 0 ) ? 
-EIO : 0; } Index: linux-2.6.8.1-ck9/fs/reiserfs/journal.c =================================================================== --- linux-2.6.8.1-ck9.orig/fs/reiserfs/journal.c 2004-08-15 14:08:17.000000000 +1000 +++ linux-2.6.8.1-ck9/fs/reiserfs/journal.c 2004-10-02 11:38:03.071988917 +1000 @@ -127,6 +127,12 @@ static int reiserfs_clean_and_file_buffe return 0 ; } +static void disable_barrier(struct super_block *s) +{ + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH); + printk("reiserfs: disabling flush barriers on %s\n", reiserfs_bdevname(s)); +} + static struct reiserfs_bitmap_node * allocate_bitmap_node(struct super_block *p_s_sb) { struct reiserfs_bitmap_node *bn ; @@ -640,6 +646,26 @@ static void submit_ordered_buffer(struct submit_bh(WRITE, bh) ; } +static int submit_barrier_buffer(struct buffer_head *bh) { + get_bh(bh) ; + bh->b_end_io = reiserfs_end_ordered_io; + clear_buffer_dirty(bh) ; + if (!buffer_uptodate(bh)) + BUG(); + return submit_bh(WRITE_BARRIER, bh) ; +} + +static void check_barrier_completion(struct super_block *s, + struct buffer_head *bh) { + if (buffer_eopnotsupp(bh)) { + clear_buffer_eopnotsupp(bh); + disable_barrier(s); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + } +} + #define CHUNK_SIZE 32 struct buffer_chunk { struct buffer_head *bh[CHUNK_SIZE]; @@ -909,6 +935,7 @@ static int flush_commit_list(struct supe int bn ; struct buffer_head *tbh = NULL ; unsigned long trans_id = jl->j_trans_id; + int barrier = 0; reiserfs_check_lock_depth(s, "flush_commit_list") ; @@ -973,7 +1000,20 @@ static int flush_commit_list(struct supe } atomic_dec(&SB_JOURNAL(s)->j_async_throttle); - /* wait on everything written so far before writing the commit */ + /* wait on everything written so far before writing the commit + * if we are in barrier mode, send the commit down now + */ + barrier = reiserfs_barrier_flush(s); + if (barrier) { + int ret; + lock_buffer(jl->j_commit_bh); + ret = submit_barrier_buffer(jl->j_commit_bh); + if (ret == -EOPNOTSUPP) { + set_buffer_uptodate(jl->j_commit_bh); + disable_barrier(s); + barrier = 0; + } + } for (i = 0 ; i < (jl->j_len + 1) ; i++) { bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ; @@ -995,10 +1035,15 @@ static int flush_commit_list(struct supe if (atomic_read(&(jl->j_commit_left)) != 1) BUG(); - if (buffer_dirty(jl->j_commit_bh)) - BUG(); - mark_buffer_dirty(jl->j_commit_bh) ; - sync_dirty_buffer(jl->j_commit_bh) ; + if (!barrier) { + if (buffer_dirty(jl->j_commit_bh)) + BUG(); + mark_buffer_dirty(jl->j_commit_bh) ; + sync_dirty_buffer(jl->j_commit_bh) ; + } else + wait_on_buffer(jl->j_commit_bh); + + check_barrier_completion(s, jl->j_commit_bh); if (!buffer_uptodate(jl->j_commit_bh)) { reiserfs_panic(s, "journal-615: buffer write failed\n") ; } @@ -1098,8 +1143,23 @@ static int _update_journal_header_block( jh->j_last_flush_trans_id = cpu_to_le32(trans_id) ; jh->j_first_unflushed_offset = cpu_to_le32(offset) ; jh->j_mount_id = cpu_to_le32(SB_JOURNAL(p_s_sb)->j_mount_id) ; - set_buffer_dirty(SB_JOURNAL(p_s_sb)->j_header_bh) ; - sync_dirty_buffer(SB_JOURNAL(p_s_sb)->j_header_bh) ; + + if (reiserfs_barrier_flush(p_s_sb)) { + int ret; + lock_buffer(SB_JOURNAL(p_s_sb)->j_header_bh); + ret = submit_barrier_buffer(SB_JOURNAL(p_s_sb)->j_header_bh); + if (ret == -EOPNOTSUPP) { + set_buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh); + disable_barrier(p_s_sb); + goto sync; + } + wait_on_buffer(SB_JOURNAL(p_s_sb)->j_header_bh); + check_barrier_completion(p_s_sb, 
SB_JOURNAL(p_s_sb)->j_header_bh); + } else { +sync: + set_buffer_dirty(SB_JOURNAL(p_s_sb)->j_header_bh) ; + sync_dirty_buffer(SB_JOURNAL(p_s_sb)->j_header_bh) ; + } if (!buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh)) { reiserfs_warning (p_s_sb, "journal-837: IO error during journal replay"); return -EIO ; @@ -3184,11 +3244,16 @@ void reiserfs_update_inode_transaction(s REISERFS_I(inode)->i_trans_id = SB_JOURNAL(inode->i_sb)->j_trans_id ; } -static void __commit_trans_jl(struct inode *inode, unsigned long id, +/* + * returns -1 on error, 0 if no commits/barriers were done and 1 + * if a transaction was actually committed and the barrier was done + */ +static int __commit_trans_jl(struct inode *inode, unsigned long id, struct reiserfs_journal_list *jl) { struct reiserfs_transaction_handle th ; struct super_block *sb = inode->i_sb ; + int ret = 0; /* is it from the current transaction, or from an unknown transaction? */ if (id == SB_JOURNAL(sb)->j_trans_id) { @@ -3210,6 +3275,7 @@ static void __commit_trans_jl(struct ino } journal_end_sync(&th, sb, 1) ; + ret = 1; } else { /* this gets tricky, we have to make sure the journal list in @@ -3218,13 +3284,21 @@ static void __commit_trans_jl(struct ino */ flush_commit_only: if (journal_list_still_alive(inode->i_sb, id)) { + /* + * we only set ret to 1 when we know for sure + * the barrier hasn't been started yet on the commit + * block. + */ + if (atomic_read(&jl->j_commit_left) > 1) + ret = 1; flush_commit_list(sb, jl, 1) ; } } /* otherwise the list is gone, and long since committed */ + return ret; } -void reiserfs_commit_for_inode(struct inode *inode) { +int reiserfs_commit_for_inode(struct inode *inode) { unsigned long id = REISERFS_I(inode)->i_trans_id; struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; @@ -3237,7 +3311,7 @@ void reiserfs_commit_for_inode(struct in /* jl will be updated in __commit_trans_jl */ } - __commit_trans_jl(inode, id, jl); + return __commit_trans_jl(inode, id, jl); } void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, Index: linux-2.6.8.1-ck9/fs/reiserfs/super.c =================================================================== --- linux-2.6.8.1-ck9.orig/fs/reiserfs/super.c 2004-08-15 14:08:17.000000000 +1000 +++ linux-2.6.8.1-ck9/fs/reiserfs/super.c 2004-10-02 11:38:03.074988448 +1000 @@ -549,6 +549,13 @@ static const arg_desc_t logging_mode[] = {NULL, 0} }; +/* possible values for -o barrier= */ +static const arg_desc_t barrier_mode[] = { + {"none", 1<s_mount_opt &= ~all_barrier; + if (bits & flush) { + REISERFS_SB(s)->s_mount_opt |= flush; + printk("reiserfs: enabling write barrier flush mode\n"); + } else if (bits & none) { + REISERFS_SB(s)->s_mount_opt |= none; + printk("reiserfs: write barriers turned off\n"); + } + } +} + static void handle_attrs( struct super_block *s ) { struct reiserfs_super_block * rs; @@ -854,6 +879,8 @@ static int reiserfs_remount (struct supe safe_mask |= 1 << REISERFS_ATTRS; safe_mask |= 1 << REISERFS_XATTRS_USER; safe_mask |= 1 << REISERFS_POSIXACL; + safe_mask |= 1 << REISERFS_BARRIER_FLUSH; + safe_mask |= 1 << REISERFS_BARRIER_NONE; /* Update the bitmask, taking care to keep * the bits we're not allowed to change here */ @@ -900,6 +927,7 @@ static int reiserfs_remount (struct supe } handle_data_mode(s, mount_options); + handle_barrier_mode(s, mount_options); REISERFS_SB(s)->s_mount_state = sb_umount_state(rs) ; s->s_flags &= ~MS_RDONLY ; /* now it is safe to call journal_begin */ journal_begin(&th, s, 10) ; @@ -1413,6 +1441,9 @@ static int 
reiserfs_fill_super (struct s } else { reiserfs_info (s, "using writeback data mode\n"); } + if (reiserfs_barrier_flush(s)) { + printk("reiserfs: using flush barriers\n"); + } // set_device_ro(s->s_dev, 1) ; if( journal_init(s, jdev_name, old_format, commit_max_age) ) { Index: linux-2.6.8.1-ck9/include/linux/bio.h =================================================================== --- linux-2.6.8.1-ck9.orig/include/linux/bio.h 2004-08-15 14:08:18.000000000 +1000 +++ linux-2.6.8.1-ck9/include/linux/bio.h 2004-10-02 11:38:03.074988448 +1000 @@ -121,6 +121,7 @@ struct bio { #define BIO_CLONED 4 /* doesn't own data */ #define BIO_BOUNCED 5 /* bio is a bounce bio */ #define BIO_USER_MAPPED 6 /* contains user pages */ +#define BIO_EOPNOTSUPP 7 /* not supported */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) /* @@ -160,6 +161,8 @@ struct bio { #define bio_data(bio) (page_address(bio_page((bio))) + bio_offset((bio))) #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) #define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC)) +#define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST)) +#define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) /* * will die Index: linux-2.6.8.1-ck9/include/linux/blkdev.h =================================================================== --- linux-2.6.8.1-ck9.orig/include/linux/blkdev.h 2004-08-15 14:08:18.000000000 +1000 +++ linux-2.6.8.1-ck9/include/linux/blkdev.h 2004-10-02 11:38:03.075988292 +1000 @@ -195,6 +195,8 @@ enum rq_flag_bits { __REQ_PM_SUSPEND, /* suspend request */ __REQ_PM_RESUME, /* resume request */ __REQ_PM_SHUTDOWN, /* shutdown request */ + __REQ_BAR_PREFLUSH, /* barrier pre-flush done */ + __REQ_BAR_POSTFLUSH, /* barrier post-flush */ __REQ_NR_BITS, /* stops here */ }; @@ -220,6 +222,8 @@ enum rq_flag_bits { #define REQ_PM_SUSPEND (1 << __REQ_PM_SUSPEND) #define REQ_PM_RESUME (1 << __REQ_PM_RESUME) #define REQ_PM_SHUTDOWN (1 << __REQ_PM_SHUTDOWN) +#define REQ_BAR_PREFLUSH (1 << __REQ_BAR_PREFLUSH) +#define REQ_BAR_POSTFLUSH (1 << __REQ_BAR_POSTFLUSH) /* * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME @@ -248,6 +252,7 @@ typedef void (unplug_fn) (request_queue_ struct bio_vec; typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *); typedef void (activity_fn) (void *data, int rw); +typedef int (issue_flush_fn) (request_queue_t *, struct gendisk *, sector_t *); enum blk_queue_state { Queue_down, @@ -290,6 +295,7 @@ struct request_queue unplug_fn *unplug_fn; merge_bvec_fn *merge_bvec_fn; activity_fn *activity_fn; + issue_flush_fn *issue_flush_fn; /* * Auto-unplugging state @@ -373,6 +379,7 @@ struct request_queue #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ #define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */ #define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */ +#define QUEUE_FLAG_ORDERED 8 /* supports ordered writes */ #define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) @@ -390,6 +397,10 @@ struct request_queue #define blk_pm_request(rq) \ ((rq)->flags & (REQ_PM_SUSPEND | REQ_PM_RESUME)) +#define blk_barrier_rq(rq) ((rq)->flags & REQ_HARDBARRIER) +#define blk_barrier_preflush(rq) ((rq)->flags & REQ_BAR_PREFLUSH) +#define blk_barrier_postflush(rq) ((rq)->flags & REQ_BAR_POSTFLUSH) + #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) #define rq_data_dir(rq) ((rq)->flags & 1) @@ -560,6 +571,14 @@ extern void end_that_request_last(struct 
Index: linux-2.6.8.1-ck9/include/linux/blkdev.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/blkdev.h	2004-08-15 14:08:18.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/blkdev.h	2004-10-02 11:38:03.075988292 +1000
@@ -195,6 +195,8 @@ enum rq_flag_bits {
 	__REQ_PM_SUSPEND,	/* suspend request */
 	__REQ_PM_RESUME,	/* resume request */
 	__REQ_PM_SHUTDOWN,	/* shutdown request */
+	__REQ_BAR_PREFLUSH,	/* barrier pre-flush done */
+	__REQ_BAR_POSTFLUSH,	/* barrier post-flush */
 	__REQ_NR_BITS,		/* stops here */
 };

@@ -220,6 +222,8 @@ enum rq_flag_bits {
 #define REQ_PM_SUSPEND	(1 << __REQ_PM_SUSPEND)
 #define REQ_PM_RESUME	(1 << __REQ_PM_RESUME)
 #define REQ_PM_SHUTDOWN	(1 << __REQ_PM_SHUTDOWN)
+#define REQ_BAR_PREFLUSH	(1 << __REQ_BAR_PREFLUSH)
+#define REQ_BAR_POSTFLUSH	(1 << __REQ_BAR_POSTFLUSH)

 /*
  * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME
@@ -248,6 +252,7 @@ typedef void (unplug_fn) (request_queue_
 struct bio_vec;
 typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *);
 typedef void (activity_fn) (void *data, int rw);
+typedef int (issue_flush_fn) (request_queue_t *, struct gendisk *, sector_t *);

 enum blk_queue_state {
 	Queue_down,
@@ -290,6 +295,7 @@ struct request_queue
 	unplug_fn		*unplug_fn;
 	merge_bvec_fn		*merge_bvec_fn;
 	activity_fn		*activity_fn;
+	issue_flush_fn		*issue_flush_fn;

 	/*
 	 * Auto-unplugging state
@@ -373,6 +379,7 @@ struct request_queue
 #define QUEUE_FLAG_DEAD		5	/* queue being torn down */
 #define QUEUE_FLAG_REENTER	6	/* Re-entrancy avoidance */
 #define QUEUE_FLAG_PLUGGED	7	/* queue is plugged */
+#define QUEUE_FLAG_ORDERED	8	/* supports ordered writes */

 #define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
@@ -390,6 +397,10 @@ struct request_queue
 #define blk_pm_request(rq)	\
 	((rq)->flags & (REQ_PM_SUSPEND | REQ_PM_RESUME))

+#define blk_barrier_rq(rq)	((rq)->flags & REQ_HARDBARRIER)
+#define blk_barrier_preflush(rq)	((rq)->flags & REQ_BAR_PREFLUSH)
+#define blk_barrier_postflush(rq)	((rq)->flags & REQ_BAR_POSTFLUSH)
+
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)

 #define rq_data_dir(rq)		((rq)->flags & 1)
@@ -560,6 +571,14 @@ extern void end_that_request_last(struct
 extern int process_that_request_first(struct request *, unsigned int);
 extern void end_request(struct request *req, int uptodate);

+/*
+ * end_that_request_first/chunk() takes an uptodate argument. we account
+ * any value <= 0 as an io error. 0 means -EIO for compatibility reasons,
+ * any other < 0 value is the direct error type. An uptodate value of
+ * 1 indicates successful io completion
+ */
+#define end_io_error(uptodate)	(unlikely((uptodate) <= 0))
+
 static inline void blkdev_dequeue_request(struct request *req)
 {
 	BUG_ON(list_empty(&req->queuelist));
@@ -588,6 +607,9 @@ extern void blk_queue_prep_rq(request_qu
 extern void blk_queue_merge_bvec(request_queue_t *, merge_bvec_fn *);
 extern void blk_queue_dma_alignment(request_queue_t *, int);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
+extern void blk_queue_ordered(request_queue_t *, int);
+extern void blk_queue_issue_flush_fn(request_queue_t *, issue_flush_fn *);
+extern int blkdev_scsi_issue_flush_fn(request_queue_t *, struct gendisk *, sector_t *);
 extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
@@ -616,6 +638,7 @@ extern long blk_congestion_wait(int rw,
 extern void blk_rq_bio_prep(request_queue_t *, struct request *, struct bio *);
 extern void blk_rq_prep_restart(struct request *);

+extern int blkdev_issue_flush(struct block_device *, sector_t *);

 #define MAX_PHYS_SEGMENTS 128
 #define MAX_HW_SEGMENTS 128
Index: linux-2.6.8.1-ck9/include/linux/buffer_head.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/buffer_head.h	2004-06-16 17:35:45.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/buffer_head.h	2004-10-02 11:38:03.076988135 +1000
@@ -26,6 +26,8 @@ enum bh_state_bits {
 	BH_Delay,	/* Buffer is not yet allocated on disk */
 	BH_Boundary,	/* Block is followed by a discontiguity */
 	BH_Write_EIO,	/* I/O error on write */
+	BH_Ordered,	/* ordered write */
+	BH_Eopnotsupp,	/* operation not supported (barrier) */

 	BH_PrivateStart,/* not a state bit, but the first bit available
 			 * for private allocation by other entities
@@ -110,7 +112,9 @@ BUFFER_FNS(Async_Read, async_read)
 BUFFER_FNS(Async_Write, async_write)
 BUFFER_FNS(Delay, delay)
 BUFFER_FNS(Boundary, boundary)
-BUFFER_FNS(Write_EIO,write_io_error)
+BUFFER_FNS(Write_EIO, write_io_error)
+BUFFER_FNS(Ordered, ordered)
+BUFFER_FNS(Eopnotsupp, eopnotsupp)

 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
 #define touch_buffer(bh)	mark_page_accessed(bh->b_page)
@@ -172,8 +176,8 @@ void free_buffer_head(struct buffer_head
 void FASTCALL(unlock_buffer(struct buffer_head *bh));
 void FASTCALL(__lock_buffer(struct buffer_head *bh));
 void ll_rw_block(int, int, struct buffer_head * bh[]);
-void sync_dirty_buffer(struct buffer_head *bh);
-void submit_bh(int, struct buffer_head *);
+int sync_dirty_buffer(struct buffer_head *bh);
+int submit_bh(int, struct buffer_head *);
 void write_boundary_block(struct block_device *bdev,
 			sector_t bblock, unsigned blocksize);
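Together with the new int return from submit_bh()/sync_dirty_buffer(), the BH_Ordered/BH_Eopnotsupp bits give journalled filesystems a simple retry convention. A sketch modeled on the journal commit path; write_block_ordered is an invented name, not part of the patch:

#include <linux/buffer_head.h>

/*
 * Write one dirty buffer, optionally as an ordered (barrier) write.
 * If the device turns out not to support barriers, fall back to a
 * plain write so the data still reaches the platter.
 */
static int write_block_ordered(struct buffer_head *bh, int barrier)
{
	int ret;

	if (barrier)
		set_buffer_ordered(bh);	/* BH_Ordered => barrier write */

	ret = sync_dirty_buffer(bh);	/* now returns an error code */

	if (barrier && buffer_eopnotsupp(bh)) {
		/* device rejected the barrier: retry as an ordinary write */
		clear_buffer_eopnotsupp(bh);
		clear_buffer_ordered(bh);
		set_buffer_uptodate(bh);
		set_buffer_dirty(bh);
		ret = sync_dirty_buffer(bh);
	}
	return ret;
}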
Index: linux-2.6.8.1-ck9/include/linux/ext3_fs.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/ext3_fs.h	2004-08-15 14:08:19.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/ext3_fs.h	2004-10-02 11:38:03.076988135 +1000
@@ -324,6 +324,7 @@ struct ext3_inode {
 #define EXT3_MOUNT_NO_UID32		0x2000	/* Disable 32-bit UIDs */
 #define EXT3_MOUNT_XATTR_USER		0x4000	/* Extended user attributes */
 #define EXT3_MOUNT_POSIX_ACL		0x8000	/* POSIX Access Control Lists */
+#define EXT3_MOUNT_BARRIER		0x10000	/* Use block barriers */

 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
Index: linux-2.6.8.1-ck9/include/linux/fs.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/fs.h	2004-08-15 14:08:19.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/fs.h	2004-10-02 11:38:03.077987979 +1000
@@ -88,6 +88,7 @@ extern int leases_enable, dir_notify_ena
 #define SPECIAL 4	/* For non-blockdevice requests in request queue */
 #define READ_SYNC	(READ | (1 << BIO_RW_SYNC))
 #define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNC))
+#define WRITE_BARRIER	((1 << BIO_RW) | (1 << BIO_RW_BARRIER))

 #define SEL_IN		1
 #define SEL_OUT		2
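WRITE_BARRIER is the bio-level counterpart: an ordinary write with BIO_RW_BARRIER set. An illustrative submitter, reusing the barrier_end_io callback sketched after the bio.h hunks; submit_barrier_page and all local names are invented:

#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/completion.h>

/*
 * Synchronously write one page as a barrier request. Returns
 * -EOPNOTSUPP when the device cannot do barriers, so the caller can
 * retry with plain WRITE.
 */
static int submit_barrier_page(struct block_device *bdev, sector_t sector,
			       struct page *page)
{
	DECLARE_COMPLETION(done);
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	int ret = 0;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = barrier_end_io;	/* earlier sketch */
	bio->bi_private = &done;
	bio_add_page(bio, page, PAGE_SIZE, 0);

	submit_bio(WRITE_BARRIER, bio);		/* write + BIO_RW_BARRIER */
	wait_for_completion(&done);

	if (bio_flagged(bio, BIO_EOPNOTSUPP))
		ret = -EOPNOTSUPP;		/* retry without the barrier */
	else if (!bio_flagged(bio, BIO_UPTODATE))
		ret = -EIO;
	bio_put(bio);
	return ret;
}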
Index: linux-2.6.8.1-ck9/include/linux/ide.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/ide.h	2004-08-15 14:08:19.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/ide.h	2004-10-02 11:38:03.078987823 +1000
@@ -780,6 +780,7 @@ typedef struct ide_drive_s {
 	u8	sect;		/* "real" sectors per track */
 	u8	bios_head;	/* BIOS/fdisk/LILO number of heads */
 	u8	bios_sect;	/* BIOS/fdisk/LILO sectors per track */
+	u8	doing_barrier;	/* state, 1=currently doing flush */

 	unsigned int	bios_cyl;	/* BIOS/fdisk/LILO number of cyls */
 	unsigned int	cyl;		/* "real" number of cyls */
@@ -1293,6 +1294,11 @@ extern ide_startstop_t ide_do_reset (ide
 extern void ide_init_drive_cmd (struct request *rq);

 /*
+ * this function returns error location sector offset in case of a write error
+ */
+extern u64 ide_get_error_location(ide_drive_t *, char *);
+
+/*
  * "action" parameter type for ide_do_drive_cmd() below.
  */
 typedef enum {
@@ -1664,4 +1670,11 @@ extern struct semaphore ide_cfg_sem;

 extern struct bus_type ide_bus_type;

+/* check if CACHE FLUSH (EXT) command is supported (bits defined in ATA-6) */
+#define ide_id_has_flush_cache(id)	((id)->cfs_enable_2 & 0x3000)
+
+/* some Maxtor disks have bit 13 defined incorrectly so check bit 10 too */
+#define ide_id_has_flush_cache_ext(id)	\
+	(((id)->cfs_enable_2 & 0x2400) == 0x2400)
+
 #endif /* _IDE_H */
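These id bits are what gates barrier support on IDE: only a drive that implements the ATA-6 cache flush commands can back a barrier with a real flush. A hypothetical probe-time check; example_enable_barriers is not part of the patch, which wires this up in its ide-disk changes:

#include <linux/ide.h>

/*
 * Advertise ordered-write support only when the drive can actually
 * execute FLUSH CACHE. LBA48 drives additionally want the EXT variant,
 * hence the separate ide_id_has_flush_cache_ext() test with the
 * Maxtor bit-13 workaround.
 */
static void example_enable_barriers(ide_drive_t *drive)
{
	struct hd_driveid *id = drive->id;

	if (ide_id_has_flush_cache(id))
		blk_queue_ordered(drive->queue, 1);
}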
Index: linux-2.6.8.1-ck9/include/linux/jbd.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/jbd.h	2004-08-15 14:08:19.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/jbd.h	2004-10-02 11:38:03.079987667 +1000
@@ -840,6 +840,7 @@ struct journal_s
 #define JFS_ACK_ERR	0x004	/* The errno in the sb has been acked */
 #define JFS_FLUSHED	0x008	/* The journal superblock has been flushed */
 #define JFS_LOADED	0x010	/* The journal superblock has been loaded */
+#define JFS_BARRIER	0x020	/* Use IDE barriers */

 /*
  * Function declarations for the journaling transaction and buffer
Index: linux-2.6.8.1-ck9/include/linux/reiserfs_fs.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/reiserfs_fs.h	2004-08-15 14:08:19.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/reiserfs_fs.h	2004-10-02 11:38:03.080987510 +1000
@@ -1777,7 +1777,8 @@ int reiserfs_end_persistent_transaction(
 int reiserfs_commit_page(struct inode *inode, struct page *page, unsigned from, unsigned to);
 int reiserfs_flush_old_commits(struct super_block *);
-void reiserfs_commit_for_inode(struct inode *) ;
+int reiserfs_commit_for_inode(struct inode *) ;
+int reiserfs_inode_needs_commit(struct inode *) ;
 void reiserfs_update_inode_transaction(struct inode *) ;
 void reiserfs_wait_on_write_block(struct super_block *s) ;
 void reiserfs_block_writes(struct reiserfs_transaction_handle *th) ;
Index: linux-2.6.8.1-ck9/include/linux/reiserfs_fs_sb.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/reiserfs_fs_sb.h	2004-06-16 17:35:46.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/reiserfs_fs_sb.h	2004-10-02 11:38:03.081987354 +1000
@@ -444,6 +444,8 @@ enum reiserfs_mount_options {
 	REISERFS_XATTRS,
 	REISERFS_XATTRS_USER,
 	REISERFS_POSIXACL,
+	REISERFS_BARRIER_NONE,
+	REISERFS_BARRIER_FLUSH,

 	REISERFS_TEST1,
 	REISERFS_TEST2,
@@ -473,6 +475,8 @@ enum reiserfs_mount_options {
 #define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER))
 #define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL))
 #define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s))
+#define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE))
+#define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH))

 void reiserfs_file_buffer (struct buffer_head * bh, int list);
 extern struct file_system_type reiserfs_fs_type;
Index: linux-2.6.8.1-ck9/include/scsi/scsi_driver.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/scsi/scsi_driver.h	2004-03-11 21:29:26.000000000 +1100
+++ linux-2.6.8.1-ck9/include/scsi/scsi_driver.h	2004-10-02 11:38:03.082987198 +1000
@@ -13,6 +13,7 @@ struct scsi_driver {

 	int (*init_command)(struct scsi_cmnd *);
 	void (*rescan)(struct device *);
+	int (*issue_flush)(struct device *, sector_t *);
 };

 #define to_scsi_driver(drv) \
 	container_of((drv), struct scsi_driver, gendrv)
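For completeness, a hedged sketch of the driver-side contract implied by the hooks above: a driver whose hardware can flush its write cache advertises ordered-write support and registers a flush callback when it sets up its queue. The mydrv_* names are invented; the flush body is a stub, since the actual command is hardware specific:

#include <linux/blkdev.h>
#include <linux/genhd.h>

/*
 * Queue-level flush hook. On failure the driver may store the sector
 * where the error occurred in *error_sector (if non-NULL), matching
 * the issue_flush_fn signature added to blkdev.h.
 */
static int mydrv_issue_flush(request_queue_t *q, struct gendisk *disk,
			     sector_t *error_sector)
{
	/* send a cache-flush command to the hardware here */
	return 0;
}

static void mydrv_init_queue(request_queue_t *q)
{
	blk_queue_ordered(q, 1);			/* barriers supported */
	blk_queue_issue_flush_fn(q, mydrv_issue_flush);
}

A SCSI upper-level driver would instead fill in the new scsi_driver.issue_flush method; the blkdev_scsi_issue_flush_fn() declaration suggests it exists to route queue-level flush requests to that per-driver method.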