From: Kiyoshi Ueda This patch adds a service time oriented dynamic load balancer, dm-service-time, which selects the path with the shortest estimated service time for the incoming I/O. The service time is estimated by dividing the in-flight I/O size by a performance value of each path. The performance value can be given as a table argument at the table loading time. If no performance value is given, all paths are considered equal. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon --- Documentation/device-mapper/dm-service-time.txt | 91 ++++++ drivers/md/Kconfig | 10 drivers/md/Makefile | 1 drivers/md/dm-service-time.c | 339 ++++++++++++++++++++++++ 4 files changed, 441 insertions(+) Index: linux-2.6.30/Documentation/device-mapper/dm-service-time.txt =================================================================== --- /dev/null +++ linux-2.6.30/Documentation/device-mapper/dm-service-time.txt @@ -0,0 +1,91 @@ +dm-service-time +=============== + +dm-service-time is a path selector module for device-mapper targets, +which selects a path with the shortest estimated service time for +the incoming I/O. + +The service time for each path is estimated by dividing the total size +of in-flight I/Os on a path with the performance value of the path. +The performance value is a relative throughput value among all paths +in a path-group, and it can be specified as a table argument. + +The path selector name is 'service-time'. + +Table parameters for each path: [ []] + : The number of I/Os to dispatch using the selected + path before switching to the next path. + If not given, internal default is used. To check + the default value, see the activated table. + : The relative throughput value of the path + among all paths in the path-group. + The valid range is 0-100. + If not given, minimum value '1' is used. + If '0' is given, the path isn't selected while + other paths having a positive value are available. + +Status for each path: \ + + : 'A' if the path is active, 'F' if the path is failed. + : The number of path failures. + : The size of in-flight I/Os on the path. + : The relative throughput value of the path + among all paths in the path-group. + + +Algorithm +========= + +dm-service-time adds the I/O size to 'in-flight-size' when the I/O is +dispatched and substracts when completed. +Basically, dm-service-time selects a path having minimum service time +which is calculated by: + + ('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput' + +However, some optimizations below are used to reduce the calculation +as much as possible. + + 1. If the paths have the same 'relative_throughput', skip + the division and just compare the 'in-flight-size'. + + 2. If the paths have the same 'in-flight-size', skip the division + and just compare the 'relative_throughput'. + + 3. If some paths have non-zero 'relative_throughput' and others + have zero 'relative_throughput', ignore those paths with zero + 'relative_throughput'. + +If such optimizations can't be applied, calculate service time, and +compare service time. +If calculated service time is equal, the path having maximum +'relative_throughput' may be better. So compare 'relative_throughput' +then. + + +Examples +======== +In case that 2 paths (sda and sdb) are used with repeat_count == 128 +and sda has an average throughput 1GB/s and sdb has 4GB/s, +'relative_throughput' value may be '1' for sda and '4' for sdb. + +# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \ + dmsetup create test +# +# dmsetup table +test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4 +# +# dmsetup status +test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4 + + +Or '2' for sda and '8' for sdb would be also true. + +# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \ + dmsetup create test +# +# dmsetup table +test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8 +# +# dmsetup status +test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8 Index: linux-2.6.30/drivers/md/Kconfig =================================================================== --- linux-2.6.30.orig/drivers/md/Kconfig +++ linux-2.6.30/drivers/md/Kconfig @@ -258,6 +258,16 @@ config DM_MULTIPATH_QL If unsure, say N. +config DM_MULTIPATH_ST + tristate "I/O Path Selector based on the service time" + depends on DM_MULTIPATH + ---help--- + This path selector is a dynamic load balancer which selects + the path expected to complete the incoming I/O in the shortest + time. + + If unsure, say N. + config DM_DELAY tristate "I/O delaying target (EXPERIMENTAL)" depends on BLK_DEV_DM && EXPERIMENTAL Index: linux-2.6.30/drivers/md/Makefile =================================================================== --- linux-2.6.30.orig/drivers/md/Makefile +++ linux-2.6.30/drivers/md/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_DM_CRYPT) += dm-crypt.o obj-$(CONFIG_DM_DELAY) += dm-delay.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o +obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o obj-$(CONFIG_DM_ZERO) += dm-zero.o Index: linux-2.6.30/drivers/md/dm-service-time.c =================================================================== --- /dev/null +++ linux-2.6.30/drivers/md/dm-service-time.c @@ -0,0 +1,339 @@ +/* + * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. + * + * Module Author: Kiyoshi Ueda + * + * This file is released under the GPL. + * + * Throughput oriented path selector. + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#define DM_MSG_PREFIX "multipath service-time" +#define ST_MIN_IO 1 +#define ST_MAX_RELATIVE_THROUGHPUT 100 +#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 +#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) +#define ST_VERSION "0.2.0" + +struct selector { + struct list_head valid_paths; + struct list_head failed_paths; +}; + +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned repeat_count; + unsigned relative_throughput; + atomic_t in_flight_size; /* Total size of in-flight I/Os */ +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->failed_paths); + } + + return s; +} + +static int st_create(struct path_selector *ps, unsigned argc, char **argv) +{ + struct selector *s = alloc_selector(); + + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +static void st_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + free_paths(&s->valid_paths); + free_paths(&s->failed_paths); + kfree(s); + ps->context = NULL; +} + +static int st_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned maxlen) +{ + unsigned sz = 0; + struct path_info *pi; + + if (!path) + DMEMIT("0 "); + else { + pi = path->pscontext; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%d %u ", atomic_read(&pi->in_flight_size), + pi->relative_throughput); + break; + case STATUSTYPE_TABLE: + DMEMIT("%u %u ", pi->repeat_count, + pi->relative_throughput); + break; + } + } + + return sz; +} + +static int st_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned repeat_count = ST_MIN_IO; + unsigned relative_throughput = 1; + + /* + * Arguments: [ []] + * : The number of I/Os before switching path. + * If not given, default (ST_MIN_IO) is used. + * : The relative throughput value of + * the path among all paths in the path-group. + * The valid range: 0- + * If not given, minimum value '1' is used. + * If '0' is given, the path isn't selected while + * other paths having a positive value are + * available. + */ + if (argc > 2) { + *error = "service-time ps: incorrect number of arguments"; + return -EINVAL; + } + + if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) { + *error = "service-time ps: invalid repeat count"; + return -EINVAL; + } + + if ((argc == 2) && + (sscanf(argv[1], "%u", &relative_throughput) != 1 || + relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { + *error = "service-time ps: invalid relative_throughput value"; + return -EINVAL; + } + + /* allocate the path */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "service-time ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + pi->relative_throughput = relative_throughput; + atomic_set(&pi->in_flight_size, 0); + + path->pscontext = pi; + + list_add_tail(&pi->list, &s->valid_paths); + + return 0; +} + +static void st_fail_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + + list_move(&pi->list, &s->failed_paths); +} + +static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + + list_move_tail(&pi->list, &s->valid_paths); + + return 0; +} + +/* + * Compare the estimated service time of 2 paths, pi1 and pi2, + * for the incoming I/O. + * + * Returns: + * < 0 : pi1 is better + * 0 : no difference between pi1 and pi2 + * > 0 : pi2 is better + * + * Description: + * Basically, the service time is estimated by: + * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput' + * To reduce the calculation, some optimizations are made. + * (See comments inline) + */ +static int st_compare_load(struct path_info *pi1, struct path_info *pi2, + size_t incoming) +{ + size_t sz1, sz2, st1, st2; + + sz1 = atomic_read(&pi1->in_flight_size); + sz2 = atomic_read(&pi2->in_flight_size); + + /* + * Case 1: Both have same throughput value. Choose less loaded path. + */ + if (pi1->relative_throughput == pi2->relative_throughput) + return sz1 - sz2; + + /* + * Case 2a: Both have same load. Choose higher throughput path. + * Case 2b: One path has no throughput value. Choose the other one. + */ + if (sz1 == sz2 || + !pi1->relative_throughput || !pi2->relative_throughput) + return pi2->relative_throughput - pi1->relative_throughput; + + /* + * Case 3: Calculate service time. Choose faster path. + * Service time using pi1: + * st1 = (sz1 + incoming) / pi1->relative_throughput + * Service time using pi2: + * st2 = (sz2 + incoming) / pi2->relative_throughput + * + * To avoid the division, transform the expression to use + * multiplication. + * Because ->relative_throughput > 0 here, if st1 < st2, + * the expressions below are the same meaning: + * (sz1 + incoming) / pi1->relative_throughput < + * (sz2 + incoming) / pi2->relative_throughput + * (sz1 + incoming) * pi2->relative_throughput < + * (sz2 + incoming) * pi1->relative_throughput + * So use the later one. + */ + sz1 += incoming; + sz2 += incoming; + if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE || + sz2 >= ST_MAX_INFLIGHT_SIZE)) { + /* + * Size may be too big for multiplying pi->relative_throughput + * and overflow. + * To avoid the overflow and mis-selection, shift down both. + */ + sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; + sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; + } + st1 = sz1 * pi2->relative_throughput; + st2 = sz2 * pi1->relative_throughput; + if (st1 != st2) + return st1 - st2; + + /* + * Case 4: Service time is equal. Choose higher throughput path. + */ + return pi2->relative_throughput - pi1->relative_throughput; +} + +static struct dm_path *st_select_path(struct path_selector *ps, + unsigned *repeat_count, size_t nr_bytes) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL, *best = NULL; + + if (list_empty(&s->valid_paths)) + return NULL; + + /* Change preferred (first in list) path to evenly balance. */ + list_move_tail(s->valid_paths.next, &s->valid_paths); + + list_for_each_entry(pi, &s->valid_paths, list) + if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) + best = pi; + + if (!best) + return NULL; + + *repeat_count = best->repeat_count; + + return best->path; +} + +static int st_start_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_add(nr_bytes, &pi->in_flight_size); + + return 0; +} + +static int st_end_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_sub(nr_bytes, &pi->in_flight_size); + + return 0; +} + +static struct path_selector_type st_ps = { + .name = "service-time", + .module = THIS_MODULE, + .table_args = 2, + .info_args = 2, + .create = st_create, + .destroy = st_destroy, + .status = st_status, + .add_path = st_add_path, + .fail_path = st_fail_path, + .reinstate_path = st_reinstate_path, + .select_path = st_select_path, + .start_io = st_start_io, + .end_io = st_end_io, +}; + +static int __init dm_st_init(void) +{ + int r = dm_register_path_selector(&st_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " ST_VERSION " loaded"); + + return r; +} + +static void __exit dm_st_exit(void) +{ + int r = dm_unregister_path_selector(&st_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_st_init); +module_exit(dm_st_exit); + +MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector"); +MODULE_AUTHOR("Kiyoshi Ueda "); +MODULE_LICENSE("GPL");