diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 32d0b87..e8115a1 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -923,7 +923,7 @@ static int multipath_map(struct dm_target *ti, struct request *clone, /* * Take a path out of use. */ -static int fail_path(struct pgpath *pgpath) +static int __fail_path(struct pgpath *pgpath, int error) { unsigned long flags; struct multipath *m = pgpath->pg->m; @@ -933,6 +933,23 @@ static int fail_path(struct pgpath *pgpath) if (!pgpath->is_active) goto out; + /* + * We know we want to fail the path for -ENOLINK. + * + * We currently also must fail the path for -EIO, + * because we do not know if a device error is + * from the device or from a controller. + * + * For -ETIME, I do not know if we want to fail the path + * or not. If a cmd timed out and the scsi layer could not + * recover then the path will be offlined and so it will + * be failed eventually. However, if there is just a + * flakey controller or path, then a cmd could be timing + * out and causing slowdowns. If we fail the path though, + * the path tester test io will probably eventually succeed + * and online the path and we will probably end up bouncing + * around. 
+ */ DMWARN("Failing path %s.", pgpath->path.dev->name); pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); @@ -948,7 +965,9 @@ static int fail_path(struct pgpath *pgpath) pgpath->path.dev->name, m->nr_valid_paths); schedule_work(&m->trigger_event); - queue_work(kmultipathd, &pgpath->deactivate_path); + + if (error == -ETIME) + queue_work(kmultipathd, &pgpath->deactivate_path); out: spin_unlock_irqrestore(&m->lock, flags); @@ -956,6 +975,11 @@ out: return 0; } +static int fail_path(struct pgpath *pgpath) +{ + return __fail_path(pgpath, -EIO); +} + /* * Reinstate a previously-failed path */ @@ -1214,11 +1238,15 @@ static int do_end_io(struct multipath *m, struct request *clone, if (!error && !clone->errors) return 0; /* I/O complete */ - if (error == -EOPNOTSUPP) + /* + * Trying new paths on fatal device errors is pointless and + * may not be safe, so fail upwards immediately. + */ + if (error == -EOPNOTSUPP || error == -ENXIO) return error; if (mpio->pgpath) - fail_path(mpio->pgpath); + __fail_path(mpio->pgpath, error); spin_lock_irqsave(&m->lock, flags); if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m)) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 5987da8..838a3ec 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -791,12 +791,49 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) error = -EIO; - if (host_byte(result) == DID_RESET) { - /* Third party bus reset or reset for error recovery - * reasons. Just retry the command and see what - * happens. - */ - action = ACTION_RETRY; + if (host_byte(cmd->result)) { + switch (host_byte(cmd->result)) { + case DID_TRANSPORT_DISRUPTED: + case DID_TRANSPORT_FAILFAST: + case DID_NO_CONNECT: + case DID_BUS_BUSY: + error = -ENOLINK; + description = "Transport failure"; + action = ACTION_FAIL; + break; + case DID_RESET: + /* Third party bus reset or reset for error recovery + * reasons. 
Just retry the command and see what + * happens. + */ + action = ACTION_RETRY; + break; + default: + description = "Unhandled host byte error code"; + action = ACTION_FAIL; + } + } else if (driver_byte(cmd->result)) { + action = ACTION_FAIL; + + switch (driver_byte(cmd->result)) { + case DRIVER_TIMEOUT: + error = -ETIME; + description = "Command timed out"; + break; + default: + description = "Unhandled driver byte error code"; + } + } else if (status_byte(cmd->result)) { + action = ACTION_FAIL; + + switch (status_byte(cmd->result)) { + case RESERVATION_CONFLICT: + error = -ENXIO; + description = "Nonretryable device error"; + break; + default: + description = "Unhandled status byte error code"; + } } else if (sense_valid && !sense_deferred) { switch (sshdr.sense_key) { case UNIT_ATTENTION: