GIT b3b6a155c2b85d436b192d74e459f837eab0944e git+ssh://master.kernel.org/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git#cfq

commit b3b6a155c2b85d436b192d74e459f837eab0944e
Author: Jens Axboe
Date:   Tue Mar 28 13:05:34 2006 +0200

[BLOCK] cfq-iosched: change from escalator to staircase type service

Currently, the priority RR algorithm in CFQ behaves like a see-saw, where
the swing extends one extra prio level per iteration until they are all
covered (then it starts over). This works fine for bandwidth distribution,
but not so well for latencies. Writing a test model for this algorithm
gives the following computed latencies for one process running at each
(of 8) priority levels:

prio0: 30.01% disk time,  700msec max latency (tested  710msec)
prio1: 23.34% disk time,  900msec max latency (tested  728msec)
prio2: 17.50% disk time, 1260msec max latency (tested 1084msec)
prio3: 12.50% disk time, 1760msec max latency (tested 1581msec)
prio4:  8.33% disk time, 2380msec max latency (tested 2228msec)
prio5:  5.00% disk time, 3100msec max latency (tested 2922msec)
prio6:  2.50% disk time, 3900msec max latency (tested 3730msec)
prio7:  0.83% disk time, 4760msec max latency (tested 4588msec)

'tested' shows actual latencies measured with 'fio'; reality matches the
theory. So far, so good. But if we simulate 3 processes at each prio
level, the max latency for prio0 rises to 2460msec, and prio4 (which is
the default for a process) rises to 7340msec!

A more generic staircase model, where a process climbs down the stairs
(priority levels), its dynamic priority increasing until it hits the
bottom step and then sliding back to its original prio, could be more
interesting from a latency POV. Simulating that shows (for 1 process at
each level):

prio0: 27.85% disk time,  400msec max latency (tested  405msec)
prio1: 22.15% disk time,  420msec max latency (tested  429msec)
prio2: 17.09% disk time,  620msec max latency (tested  662msec)
prio3: 12.66% disk time, 1080msec max latency (tested 1126msec)
prio4:  8.86% disk time, 1600msec max latency (tested 1641msec)
prio5:  5.70% disk time, 2140msec max latency (tested 2182msec)
prio6:  3.16% disk time, 2660msec max latency (tested 2669msec)
prio7:  2.53% disk time, 2800msec max latency (tested 2803msec)

Latency is almost halved, while aggregate and individually measured
throughput stays the same. Service distribution differs a little from the
old algorithm, but not by much. For 3 processes at each level, max latency
is 1440msec for prio0 and 6120msec for prio4.

As a bonus, we drop one list from cfqd and make the code a lot more
readable.
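To illustrate the staircase walk, here is a minimal user-space model (a
sketch, not kernel code) that mirrors the cfq_prio_inc() stepping
introduced below; the prio 4 start value is just the per-process default
mentioned above:

#include <stdio.h>

#define CFQ_PRIO_LISTS	8

/*
 * Same logic as the patch's cfq_prio_inc(): step one prio level down,
 * wrapping back to low_p (the queue's original prio) after the last level.
 */
static void cfq_prio_inc(unsigned short *p, unsigned int low_p)
{
	if (++(*p) == CFQ_PRIO_LISTS)
		*p = low_p;
}

int main(void)
{
	unsigned short dyn_ioprio = 4;		/* default prio of a process */
	const unsigned short org_ioprio = 4;	/* prio to slide back to */
	int slice;

	/* one iteration per slice expiry */
	for (slice = 0; slice < 10; slice++) {
		printf("slice %2d served at prio %u\n", slice,
		       (unsigned)dyn_ioprio);
		cfq_prio_inc(&dyn_ioprio, org_ioprio);
	}
	return 0;
}

This prints a prio 4 queue being served at levels 4, 5, 6, 7 and then
back at 4: a queue re-enters service at its own level every few slices
instead of waiting for the see-saw to swing back out, which is where the
latency reduction comes from.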
Signed-off-by: Jens Axboe

---

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 2540dfa..c7b88e2 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -108,8 +108,9 @@ struct cfq_data {
 	 * rr list of queues with requests and the count of them
 	 */
 	struct list_head rr_list[CFQ_PRIO_LISTS];
-	struct list_head busy_rr;
 	struct list_head cur_rr;
+	unsigned short cur_prio;
+
 	struct list_head idle_rr;
 
 	unsigned int busy_queues;
@@ -145,7 +146,6 @@ struct cfq_data {
 	struct cfq_queue *active_queue;
 	struct cfq_io_context *active_cic;
-	int cur_prio, cur_end_prio;
 	unsigned int dispatch_slice;
 
 	struct timer_list idle_class_timer;
@@ -204,8 +204,13 @@ struct cfq_queue {
 	int on_dispatch[2];
 
 	/* io prio of this group */
-	unsigned short ioprio, org_ioprio;
-	unsigned short ioprio_class, org_ioprio_class;
+	unsigned short ioprio_class, ioprio;
+
+	/* current dynamic stair priority */
+	unsigned short dyn_ioprio;
+
+	/* same as real ioprio, except if queue has been elevated */
+	unsigned short org_ioprio_class, org_ioprio;
 
 	/* various state flags, see below */
 	unsigned int flags;
@@ -484,25 +489,13 @@ static void cfq_resort_rr_list(struct cf
 		list = &cfqd->cur_rr;
 	else if (cfq_class_idle(cfqq))
 		list = &cfqd->idle_rr;
-	else {
-		/*
-		 * if cfqq has requests in flight, don't allow it to be
-		 * found in cfq_set_active_queue before it has finished them.
-		 * this is done to increase fairness between a process that
-		 * has lots of io pending vs one that only generates one
-		 * sporadically or synchronously
-		 */
-		if (cfq_cfqq_dispatched(cfqq))
-			list = &cfqd->busy_rr;
-		else
-			list = &cfqd->rr_list[cfqq->ioprio];
-	}
+	else
+		list = &cfqd->rr_list[cfqq->dyn_ioprio];
 
 	/*
-	 * if queue was preempted, just add to front to be fair. busy_rr
-	 * isn't sorted.
+	 * if queue was preempted, just add to front to be fair.
 	 */
-	if (preempted || list == &cfqd->busy_rr) {
+	if (preempted) {
 		list_add(&cfqq->cfq_list, list);
 		return;
 	}
@@ -514,6 +507,8 @@ static void cfq_resort_rr_list(struct cf
 	while ((entry = entry->prev) != list) {
 		struct cfq_queue *__cfqq = list_entry_cfqq(entry);
 
+		if (__cfqq->ioprio < cfqq->ioprio)
+			break;
 		if (!__cfqq->service_last)
 			break;
 		if (time_before(__cfqq->service_last, cfqq->service_last))
@@ -740,23 +735,49 @@ cfq_merged_requests(request_queue_t *q,
 	cfq_remove_request(next);
 }
 
+/*
+ * Scale schedule slice based on io priority. Use the sync time slice only
+ * if a queue is marked sync and has sync io queued. A sync queue with async
+ * io only, should not get full sync slice length.
+ */
+static inline int
+cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
+	unsigned short prio = cfqq->dyn_ioprio;
+
+	WARN_ON(prio >= IOPRIO_BE_NR);
+
+	if (cfq_class_rt(cfqq))
+		prio = 0;
+
+	return base_slice + (base_slice / CFQ_SLICE_SCALE * (4 - prio));
+}
+
 static inline void
-__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
-	if (cfqq) {
-		/*
-		 * stop potential idle class queues waiting service
-		 */
-		del_timer(&cfqd->idle_class_timer);
+	cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
+}
 
-		cfqq->slice_start = jiffies;
-		cfqq->slice_end = 0;
-		cfqq->slice_left = 0;
-		cfq_clear_cfqq_must_alloc_slice(cfqq);
-		cfq_clear_cfqq_fifo_expire(cfqq);
-	}
+static inline int
+cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	const int base_rq = cfqd->cfq_slice_async_rq;
+	unsigned short prio = cfqq->dyn_ioprio;
 
-	cfqd->active_queue = cfqq;
+	WARN_ON(cfqq->dyn_ioprio >= IOPRIO_BE_NR);
+
+	if (cfq_class_rt(cfqq))
+		prio = 0;
+
+	return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - prio));
+}
+
+static inline void cfq_prio_inc(unsigned short *p, unsigned int low_p)
+{
+	if (++(*p) == CFQ_PRIO_LISTS)
+		*p = low_p;
 }
 
 /*
@@ -788,6 +809,8 @@ __cfq_slice_expired(struct cfq_data *cfq
 	else
 		cfqq->slice_left = 0;
 
+	cfq_prio_inc(&cfqq->dyn_ioprio, cfqq->ioprio);
+
 	if (cfq_cfqq_on_rr(cfqq))
 		cfq_resort_rr_list(cfqq, preempted);
 
@@ -810,73 +833,58 @@ static inline void cfq_slice_expired(str
 		__cfq_slice_expired(cfqd, cfqq, preempted);
 }
 
-/*
- * 0
- * 0,1
- * 0,1,2
- * 0,1,2,3
- * 0,1,2,3,4
- * 0,1,2,3,4,5
- * 0,1,2,3,4,5,6
- * 0,1,2,3,4,5,6,7
- */
-static int cfq_get_next_prio_level(struct cfq_data *cfqd)
+static struct cfq_queue *cfq_get_next_cfqq(struct cfq_data *cfqd)
 {
-	int prio, wrap;
+	if (!cfqd->busy_queues)
+		return NULL;
 
-	prio = -1;
-	wrap = 0;
-	do {
-		int p;
+	if (list_empty(&cfqd->cur_rr)) {
+		unsigned short prio = cfqd->cur_prio;
+
+		do {
+			struct list_head *list = &cfqd->rr_list[prio];
 
-		for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
-			if (!list_empty(&cfqd->rr_list[p])) {
-				prio = p;
+			if (!list_empty(list)) {
+				list_splice_init(list, &cfqd->cur_rr);
 				break;
 			}
-		}
 
-		if (prio != -1)
-			break;
-		cfqd->cur_prio = 0;
-		if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
-			cfqd->cur_end_prio = 0;
-			if (wrap)
-				break;
-			wrap = 1;
-		}
-	} while (1);
+			cfq_prio_inc(&prio, 0);
+		} while (prio != cfqd->cur_prio);
 
-	if (unlikely(prio == -1))
-		return -1;
+		cfq_prio_inc(&cfqd->cur_prio, 0);
+	}
 
-	BUG_ON(prio >= CFQ_PRIO_LISTS);
+	if (!list_empty(&cfqd->cur_rr))
+		return list_entry_cfqq(cfqd->cur_rr.next);
 
-	list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
+	return NULL;
+}
 
-	cfqd->cur_prio = prio + 1;
-	if (cfqd->cur_prio > cfqd->cur_end_prio) {
-		cfqd->cur_end_prio = cfqd->cur_prio;
-		cfqd->cur_prio = 0;
-	}
-	if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
-		cfqd->cur_prio = 0;
-		cfqd->cur_end_prio = 0;
+static inline void
+__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	if (cfqq) {
+		WARN_ON(RB_EMPTY(&cfqq->sort_list));
+
+		/*
+		 * stop potential idle class queues waiting service
+		 */
+		del_timer(&cfqd->idle_class_timer);
+
+		cfqq->slice_start = jiffies;
+		cfqq->slice_end = 0;
+		cfqq->slice_left = 0;
+		cfq_clear_cfqq_must_alloc_slice(cfqq);
+		cfq_clear_cfqq_fifo_expire(cfqq);
 	}
 
-	return prio;
+	cfqd->active_queue = cfqq;
 }
 
 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
 {
-	struct cfq_queue *cfqq = NULL;
-
-	/*
-	 * if current list is non-empty, grab first entry. if it is empty,
-	 * get next prio level and grab first entry then if any are spliced
-	 */
-	if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1)
-		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
+	struct cfq_queue *cfqq = cfq_get_next_cfqq(cfqd);
 
 	/*
 	 * if we have idle queues and no rt or be queues had pending
@@ -974,37 +982,6 @@ static inline struct cfq_rq *cfq_check_f
 }
 
 /*
- * Scale schedule slice based on io priority. Use the sync time slice only
- * if a queue is marked sync and has sync io queued. A sync queue with async
- * io only, should not get full sync slice length.
- */
-static inline int
-cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
-
-	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
-
-	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
-}
-
-static inline void
-cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
-}
-
-static inline int
-cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	const int base_rq = cfqd->cfq_slice_async_rq;
-
-	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
-
-	return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
-}
-
-/*
  * get next queue for service
  */
 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
@@ -1121,7 +1098,6 @@ cfq_forced_dispatch(struct cfq_data *cfq
 	for (i = 0; i < CFQ_PRIO_LISTS; i++)
 		dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]);
 
-	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr);
 	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
 	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);
 
@@ -1361,6 +1337,11 @@ static void cfq_init_prio_data(struct cf
 	cfqq->org_ioprio = cfqq->ioprio;
 	cfqq->org_ioprio_class = cfqq->ioprio_class;
 
+	/*
+	 * start priority
+	 */
+	cfqq->dyn_ioprio = cfqq->ioprio;
+
 	if (cfq_cfqq_on_rr(cfqq))
 		cfq_resort_rr_list(cfqq, 0);
 
@@ -2234,7 +2215,6 @@ static int cfq_init_queue(request_queue_
 	for (i = 0; i < CFQ_PRIO_LISTS; i++)
 		INIT_LIST_HEAD(&cfqd->rr_list[i]);
 
-	INIT_LIST_HEAD(&cfqd->busy_rr);
 	INIT_LIST_HEAD(&cfqd->cur_rr);
 	INIT_LIST_HEAD(&cfqd->idle_rr);
 	INIT_LIST_HEAD(&cfqd->empty_list);
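To make the slice scaling concrete, here is a user-space sketch (not part
of the patch) that evaluates the same arithmetic as cfq_prio_to_slice()
and cfq_prio_to_maxrq() for each dynamic priority. HZ = 1000 and the
tunable defaults shown (cfq_slice_sync = HZ / 10, cfq_slice_async_rq = 2)
are assumptions for the example, not values taken from this diff:

#include <stdio.h>

#define CFQ_PRIO_LISTS	8
#define CFQ_SLICE_SCALE	5
#define HZ		1000	/* assumed tick rate for this example */

int main(void)
{
	/* assumed defaults: cfq_slice_sync = HZ / 10, cfq_slice_async_rq = 2 */
	const int base_slice = HZ / 10;
	const int base_rq = 2;
	int prio;

	for (prio = 0; prio < CFQ_PRIO_LISTS; prio++) {
		/* same arithmetic as cfq_prio_to_slice()/cfq_prio_to_maxrq() */
		int slice = base_slice +
			(base_slice / CFQ_SLICE_SCALE * (4 - prio));
		int maxrq = 2 * (base_rq +
			base_rq * (CFQ_PRIO_LISTS - 1 - prio));

		printf("prio %d: slice %3d jiffies, maxrq %2d\n",
		       prio, slice, maxrq);
	}
	return 0;
}

With these assumed defaults, a queue at dynamic prio 0 gets a 180 jiffy
slice and a dispatch cap of 32 async requests, shrinking to 40 jiffies and
4 requests at prio 7. Since both helpers now read dyn_ioprio rather than
the static ioprio, a queue's slice shortens as it walks down the
staircase.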