GIT 06b3d63594025dd57c47daff8d7181d51e6a7a0b git+ssh://master.kernel.org/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git#for-akpm

commit 
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Fri Apr 20 20:45:39 2007 +0200

    cfq-iosched: tighten queue request overlap condition
    
    For tagged devices, allow overlap of requests if the idle window
    isn't enabled on the current active queue.
    
    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

commit 79042f83e2944207067c0baf475c115aa0f96b6a
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Mon Apr 23 08:33:33 2007 +0200

    [PATCH] cfq-iosched: improve sync vs async workloads
    
    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

commit f050ea93baf91d1910e76dbd80493c60b52831ce
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Thu Apr 19 14:32:26 2007 +0200

    cfq-iosched: never allow an async queue idling
    
    We don't enable it by default, don't let it get enabled during
    runtime.
    
    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

commit 174a32e172df3971bcc0856bccbbb01744388d3c
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Mon Apr 23 08:26:36 2007 +0200

    [PATCH] cfq-iosched: get rid of ->dispatch_slice
    
    We can track it fairly accurately locally, let the slice handling
    take care of the rest.
    
    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

commit 3dc69698edacbca860820c99aa88c92c50b59073
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Mon Apr 23 08:25:00 2007 +0200

    [PATCH] cfq-iosched: don't pass unused preemption variable around
    
    We don't use it anymore in the slice expiry handling.
    
    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

commit f1c7949d59b24f4e4b1d7c7ad65b592690004c3b
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Thu Apr 19 12:03:34 2007 +0200

    cfq-iosched: get rid of ->cur_rr and ->cfq_list
    
    It's only used for preemption now that the IDLE and RT queues also
    use the rbtree. If we pass an 'add_front' variable to
    cfq_service_tree_add(), we can set ->rb_key to 0 to force insertion
    at the front of the tree.
    
    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

commit 39f509e5d280eeb89854430a143f71a6d9998042
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Fri Apr 20 14:18:00 2007 +0200

    cfq-iosched: slice offset should take ioprio into account
    
    Use the max_slice-cur_slice as the multipler for the insertion offset.
    
    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

commit 1bc01cde64962da13a4aa35c21b097a5e06f5bb5
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Wed Apr 18 20:53:30 2007 +0200

    cfq-iosched: style cleanups and comments
    
    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

commit b294b6a3b51247c8734c82a39e167034dc6b209d
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Wed Apr 18 20:13:32 2007 +0200

    cfq-iosched: sort IDLE queues into the rbtree
    
    Same treatment as the RT conversion, just put the sorted idle
    branch at the end of the tree.
    
    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

commit 813a14b3a77007d20e0c132f832065244dbad5e4
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Wed Apr 18 20:01:57 2007 +0200

    cfq-iosched: sort RT queues into the rbtree
    
    Currently CFQ does a linked insert into the current list for RT
    queues. We can just factor the class into the rb insertion,
    and then we don't have to treat RT queues in a special way. It's
    faster, too.
    
    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

commit 2ab00cc411eb1bf725051ad28731dab7666760c8
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Fri Apr 20 14:28:40 2007 +0200

    cfq-iosched: speed up rbtree handling
    
    For cases where the rbtree is mainly used for sorting and min retrieval,
    a nice speedup of the rbtree code is to maintain a cache of the leftmost
    node in the tree.
    
    Also spotted in the CFS CPU scheduler code.
    
    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

commit e0d214f17c8b5266bc95f439a9572d658b79180e
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Fri Apr 20 14:27:50 2007 +0200

    cfq-iosched: rework the whole round-robin list concept
    
    Drawing on some inspiration from the CFS CPU scheduler design, overhaul
    the pending cfq_queue concept list management. Currently CFQ uses a
    doubly linked list per priority level for sorting and service uses.
    Kill those lists and maintain an rbtree of cfq_queue's, sorted by when
    to service them.
    
    This unfortunately means that the ionice levels aren't as strong
    anymore, will work on improving those later. We only scale the slice
    time now, not the number of times we service. This means that latency
    is better (for all priority levels), but that the distinction between
    the highest and lower levels aren't as big.
    
    The diffstat speaks for itself.
    
     cfq-iosched.c |  363 +++++++++++++++++---------------------------------
     1 file changed, 125 insertions(+), 238 deletions(-)
    
    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

commit 20933f039782cdd5f0c186992c338374925811e2
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Tue Apr 17 12:47:55 2007 +0200

    cfq-iosched: minor updates
    
    - Move the queue_new flag clear to when the queue is selected
    - Only select the non-first queue in cfq_get_best_queue(), if there's
      a substantial difference between the best and first.
    - Get rid of ->busy_rr
    - Only select a close cooperator, if the current queue is known to take
      a while to "think".
    
    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

commit b0a3cfce039023cbb8e44c54cbaff351494531f9
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Mon Apr 23 08:23:25 2007 +0200

    [PATCH] cfq-iosched: development update
    
    - Implement logic for detecting cooperating processes, so we
      choose the best available queue whenever possible.
    
    - Improve residual slice time accounting.
    
    - Remove dead code: we no longer see async requests coming in on
      sync queues. That part was removed a long time ago. That means
      that we can also remove the difference between cfq_cfqq_sync()
      and cfq_cfqq_class_sync(), they are now indentical. And we can
      kill the on_dispatch array, just make it a counter.
    
    - Allow a process to go into the current list, if it hasn't been
      serviced in this scheduler tick yet.
    
    Possible future improvements including caching the cfqq lookup
    in cfq_close_cooperator(), so we don't have to look it up twice.
    cfq_get_best_queue() should just use that last decision instead
    of doing it again.
    
    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

commit a6f2a38df28fbf521203e1164ea624ebc1eb9bfb
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Wed Feb 14 19:59:49 2007 +0100

    cfq-iosched: improve preemption for cooperating tasks
    
    When testing the syslet async io approach, I discovered that CFQ
    sometimes didn't perform as well as expected. cfq_should_preempt()
    needs to better check for cooperating tasks, so fix that by allowing
    preemption of an equal priority queue if the recently queued request
    is as good a candidate for IO as the one we are currently waiting for.
    
    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
 block/cfq-iosched.c |  676 +++++++++++++++++++++++++++------------------------
 1 files changed, 357 insertions(+), 319 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9e37971..8093733 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -26,7 +26,16 @@ static int cfq_slice_async = HZ / 25;
 static const int cfq_slice_async_rq = 2;
 static int cfq_slice_idle = HZ / 125;
 
+/*
+ * grace period before allowing idle class to get disk access
+ */
 #define CFQ_IDLE_GRACE		(HZ / 10)
+
+/*
+ * below this threshold, we consider thinktime immediate
+ */
+#define CFQ_MIN_TT		(2)
+
 #define CFQ_SLICE_SCALE		(5)
 
 #define CFQ_KEY_ASYNC		(0)
@@ -36,9 +45,6 @@ #define CFQ_KEY_ASYNC		(0)
  */
 #define CFQ_QHASH_SHIFT		6
 #define CFQ_QHASH_ENTRIES	(1 << CFQ_QHASH_SHIFT)
-#define list_entry_qhash(entry)	hlist_entry((entry), struct cfq_queue, cfq_hash)
-
-#define list_entry_cfqq(ptr)	list_entry((ptr), struct cfq_queue, cfq_list)
 
 #define RQ_CIC(rq)		((struct cfq_io_context*)(rq)->elevator_private)
 #define RQ_CFQQ(rq)		((rq)->elevator_private2)
@@ -56,17 +62,23 @@ #define cfq_class_rt(cfqq)	((cfqq)->iopr
 #define ASYNC			(0)
 #define SYNC			(1)
 
-#define cfq_cfqq_dispatched(cfqq)	\
-	((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC])
-
-#define cfq_cfqq_class_sync(cfqq)	((cfqq)->key != CFQ_KEY_ASYNC)
-
-#define cfq_cfqq_sync(cfqq)		\
-	(cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
+#define cfq_cfqq_sync(cfqq)	((cfqq)->key != CFQ_KEY_ASYNC)
 
 #define sample_valid(samples)	((samples) > 80)
 
 /*
+ * Most of our rbtree usage is for sorting with min extraction, so
+ * if we cache the leftmost node we don't have to walk down the tree
+ * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
+ * move this into the elevator for the rq sorting as well.
+ */
+struct cfq_rb_root {
+	struct rb_root rb;
+	struct rb_node *left;
+};
+#define CFQ_RB_ROOT	(struct cfq_rb_root) { RB_ROOT, NULL, }
+
+/*
  * Per block device queue structure
  */
 struct cfq_data {
@@ -75,10 +87,7 @@ struct cfq_data {
 	/*
 	 * rr list of queues with requests and the count of them
 	 */
-	struct list_head rr_list[CFQ_PRIO_LISTS];
-	struct list_head busy_rr;
-	struct list_head cur_rr;
-	struct list_head idle_rr;
+	struct cfq_rb_root service_tree;
 	unsigned int busy_queues;
 
 	/*
@@ -87,6 +96,7 @@ struct cfq_data {
 	struct hlist_head *cfq_hash;
 
 	int rq_in_driver;
+	int sync_flight;
 	int hw_tag;
 
 	/*
@@ -97,12 +107,10 @@ struct cfq_data {
 
 	struct cfq_queue *active_queue;
 	struct cfq_io_context *active_cic;
-	int cur_prio, cur_end_prio;
-	unsigned int dispatch_slice;
 
 	struct timer_list idle_class_timer;
 
-	sector_t last_sector;
+	sector_t last_position;
 	unsigned long last_end_request;
 
 	/*
@@ -117,6 +125,9 @@ struct cfq_data {
 	unsigned int cfq_slice_idle;
 
 	struct list_head cic_list;
+
+	sector_t new_seek_mean;
+	u64 new_seek_total;
 };
 
 /*
@@ -131,8 +142,10 @@ struct cfq_queue {
 	struct hlist_node cfq_hash;
 	/* hash key */
 	unsigned int key;
-	/* member of the rr/busy/cur/idle cfqd list */
-	struct list_head cfq_list;
+	/* service_tree member */
+	struct rb_node rb_node;
+	/* service_tree key */
+	unsigned long rb_key;
 	/* sorted list of pending requests */
 	struct rb_root sort_list;
 	/* if fifo isn't expired, next request to serve */
@@ -147,11 +160,10 @@ struct cfq_queue {
 	struct list_head fifo;
 
 	unsigned long slice_end;
-	unsigned long service_last;
 	long slice_resid;
 
-	/* number of requests that are on the dispatch list */
-	int on_dispatch[2];
+	/* number of requests that are on the dispatch list or inside driver */
+	int dispatched;
 
 	/* io prio of this group */
 	unsigned short ioprio, org_ioprio;
@@ -159,6 +171,8 @@ struct cfq_queue {
 
 	/* various state flags, see below */
 	unsigned int flags;
+
+	sector_t last_request_pos;
 };
 
 enum cfqq_state_flags {
@@ -202,7 +216,7 @@ #undef CFQ_CFQQ_FNS
 
 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
 static void cfq_dispatch_insert(request_queue_t *, struct request *);
-static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
+static struct cfq_queue *cfq_get_queue(struct cfq_data *, unsigned int, struct task_struct *, gfp_t);
 
 /*
  * scheduler run of queue, if there are requests pending and no one in the
@@ -237,28 +251,26 @@ static inline pid_t cfq_queue_pid(struct
  * if a queue is marked sync and has sync io queued. A sync queue with async
  * io only, should not get full sync slice length.
  */
-static inline int
-cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+static inline int cfq_prio_slice(struct cfq_data *cfqd, int sync,
+				 unsigned short prio)
 {
-	const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
+	const int base_slice = cfqd->cfq_slice[sync];
 
-	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
+	WARN_ON(prio >= IOPRIO_BE_NR);
 
-	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
+	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
+}
+
+static inline int
+cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 }
 
 static inline void
 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
-	cfqq->slice_end += cfqq->slice_resid;
-
-	/*
-	 * Don't carry over residual for more than one slice, we only want
-	 * to slightly correct the fairness. Carrying over forever would
-	 * easily introduce oscillations.
-	 */
-	cfqq->slice_resid = 0;
 }
 
 /*
@@ -307,7 +319,7 @@ #define CFQ_RQ2_WRAP	0x02 /* request 2 w
 	s1 = rq1->sector;
 	s2 = rq2->sector;
 
-	last = cfqd->last_sector;
+	last = cfqd->last_position;
 
 	/*
 	 * by definition, 1KiB is 2 sectors
@@ -372,6 +384,26 @@ #define CFQ_RQ2_WRAP	0x02 /* request 2 w
 }
 
 /*
+ * The below is leftmost cache rbtree addon
+ */
+static struct rb_node *cfq_rb_first(struct cfq_rb_root *root)
+{
+	if (root->left)
+		return root->left;
+
+	return rb_first(&root->rb);
+}
+
+static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
+{
+	if (root->left == n)
+		root->left = NULL;
+
+	rb_erase(n, &root->rb);
+	RB_CLEAR_NODE(n);
+}
+
+/*
  * would be nice to take fifo expire time into account as well
  */
 static struct request *
@@ -398,78 +430,96 @@ cfq_find_next_rq(struct cfq_data *cfqd, 
 	return cfq_choose_req(cfqd, next, prev);
 }
 
-static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
+static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
+				      struct cfq_queue *cfqq)
 {
-	struct cfq_data *cfqd = cfqq->cfqd;
-	struct list_head *list, *n;
-	struct cfq_queue *__cfqq;
-
 	/*
-	 * Resorting requires the cfqq to be on the RR list already.
+	 * just an approximation, should be ok.
 	 */
-	if (!cfq_cfqq_on_rr(cfqq))
-		return;
+	return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) -
+		       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
+}
 
-	list_del(&cfqq->cfq_list);
+/*
+ * The cfqd->service_tree holds all pending cfq_queue's that have
+ * requests waiting to be processed. It is sorted in the order that
+ * we will service the queues.
+ */
+static void cfq_service_tree_add(struct cfq_data *cfqd,
+				    struct cfq_queue *cfqq, int add_front)
+{
+	struct rb_node **p = &cfqd->service_tree.rb.rb_node;
+	struct rb_node *parent = NULL;
+	unsigned long rb_key;
+	int left;
+
+	if (!add_front) {
+		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
+		rb_key += cfqq->slice_resid;
+		cfqq->slice_resid = 0;
+	} else
+		rb_key = 0;
 
-	if (cfq_class_rt(cfqq))
-		list = &cfqd->cur_rr;
-	else if (cfq_class_idle(cfqq))
-		list = &cfqd->idle_rr;
-	else {
+	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
 		/*
-		 * if cfqq has requests in flight, don't allow it to be
-		 * found in cfq_set_active_queue before it has finished them.
-		 * this is done to increase fairness between a process that
-		 * has lots of io pending vs one that only generates one
-		 * sporadically or synchronously
+		 * same position, nothing more to do
 		 */
-		if (cfq_cfqq_dispatched(cfqq))
-			list = &cfqd->busy_rr;
-		else
-			list = &cfqd->rr_list[cfqq->ioprio];
+		if (rb_key == cfqq->rb_key)
+			return;
+
+		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
 	}
 
-	if (preempted || cfq_cfqq_queue_new(cfqq)) {
-		/*
-		 * If this queue was preempted or is new (never been serviced),
-		 * let it be added first for fairness but beind other new
-		 * queues.
-		 */
-		n = list;
-		while (n->next != list) {
-			__cfqq = list_entry_cfqq(n->next);
-			if (!cfq_cfqq_queue_new(__cfqq))
-				break;
+	left = 1;
+	while (*p) {
+		struct cfq_queue *__cfqq;
+		struct rb_node **n;
+
+		parent = *p;
+		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
 
-			n = n->next;
-		}
-		list_add_tail(&cfqq->cfq_list, n);
-	} else if (!cfq_cfqq_class_sync(cfqq)) {
-		/*
-		 * async queue always goes to the end. this wont be overly
-		 * unfair to writes, as the sort of the sync queue wont be
-		 * allowed to pass the async queue again.
-		 */
-		list_add_tail(&cfqq->cfq_list, list);
-	} else {
 		/*
-		 * sort by last service, but don't cross a new or async
-		 * queue. we don't cross a new queue because it hasn't been
-		 * service before, and we don't cross an async queue because
-		 * it gets added to the end on expire.
+		 * sort RT queues first, we always want to give
+		 * preference to them. IDLE queues goes to the back.
+		 * after that, sort on the next service time.
 		 */
-		n = list;
-		while ((n = n->prev) != list) {
-			struct cfq_queue *__cfqq = list_entry_cfqq(n);
+		if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq))
+			n = &(*p)->rb_left;
+		else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
+			n = &(*p)->rb_right;
+		else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
+			n = &(*p)->rb_left;
+		else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
+			n = &(*p)->rb_right;
+		else if (rb_key < __cfqq->rb_key)
+			n = &(*p)->rb_left;
+		else
+			n = &(*p)->rb_right;
 
-			if (!cfq_cfqq_class_sync(cfqq) || !__cfqq->service_last)
-				break;
-			if (time_before(__cfqq->service_last, cfqq->service_last))
-				break;
-		}
-		list_add(&cfqq->cfq_list, n);
+		if (n == &(*p)->rb_right)
+			left = 0;
+
+		p = n;
 	}
+
+	if (left)
+		cfqd->service_tree.left = &cfqq->rb_node;
+
+	cfqq->rb_key = rb_key;
+	rb_link_node(&cfqq->rb_node, parent, p);
+	rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
+}
+
+/*
+ * Update cfqq's position in the service tree.
+ */
+static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	/*
+	 * Resorting requires the cfqq to be on the RR list already.
+	 */
+	if (cfq_cfqq_on_rr(cfqq))
+		cfq_service_tree_add(cfqd, cfqq, 0);
 }
 
 /*
@@ -483,15 +533,21 @@ cfq_add_cfqq_rr(struct cfq_data *cfqd, s
 	cfq_mark_cfqq_on_rr(cfqq);
 	cfqd->busy_queues++;
 
-	cfq_resort_rr_list(cfqq, 0);
+	cfq_resort_rr_list(cfqd, cfqq);
 }
 
+/*
+ * Called when the cfqq no longer has requests pending, remove it from
+ * the service tree.
+ */
 static inline void
 cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
 	cfq_clear_cfqq_on_rr(cfqq);
-	list_del_init(&cfqq->cfq_list);
+
+	if (!RB_EMPTY_NODE(&cfqq->rb_node))
+		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
 
 	BUG_ON(!cfqd->busy_queues);
 	cfqd->busy_queues--;
@@ -573,6 +629,8 @@ static void cfq_activate_request(request
 	 */
 	if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
 		cfqd->hw_tag = 1;
+
+	cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
 }
 
 static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
@@ -599,8 +657,7 @@ static void cfq_remove_request(struct re
 	}
 }
 
-static int
-cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
+static int cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct request *__rq;
@@ -678,6 +735,7 @@ __cfq_set_active_queue(struct cfq_data *
 		cfq_clear_cfqq_must_alloc_slice(cfqq);
 		cfq_clear_cfqq_fifo_expire(cfqq);
 		cfq_mark_cfqq_slice_new(cfqq);
+		cfq_clear_cfqq_queue_new(cfqq);
 	}
 
 	cfqd->active_queue = cfqq;
@@ -688,23 +746,21 @@ __cfq_set_active_queue(struct cfq_data *
  */
 static void
 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-		    int preempted, int timed_out)
+		    int timed_out)
 {
 	if (cfq_cfqq_wait_request(cfqq))
 		del_timer(&cfqd->idle_slice_timer);
 
 	cfq_clear_cfqq_must_dispatch(cfqq);
 	cfq_clear_cfqq_wait_request(cfqq);
-	cfq_clear_cfqq_queue_new(cfqq);
 
 	/*
-	 * store what was left of this slice, if the queue idled out
-	 * or was preempted
+	 * store what was left of this slice, if the queue idled/timed out
 	 */
 	if (timed_out && !cfq_cfqq_slice_new(cfqq))
 		cfqq->slice_resid = cfqq->slice_end - jiffies;
 
-	cfq_resort_rr_list(cfqq, preempted);
+	cfq_resort_rr_list(cfqd, cfqq);
 
 	if (cfqq == cfqd->active_queue)
 		cfqd->active_queue = NULL;
@@ -713,163 +769,152 @@ __cfq_slice_expired(struct cfq_data *cfq
 		put_io_context(cfqd->active_cic->ioc);
 		cfqd->active_cic = NULL;
 	}
-
-	cfqd->dispatch_slice = 0;
 }
 
-static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted,
-				     int timed_out)
+static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out)
 {
 	struct cfq_queue *cfqq = cfqd->active_queue;
 
 	if (cfqq)
-		__cfq_slice_expired(cfqd, cfqq, preempted, timed_out);
+		__cfq_slice_expired(cfqd, cfqq, timed_out);
 }
 
 /*
- * 0
- * 0,1
- * 0,1,2
- * 0,1,2,3
- * 0,1,2,3,4
- * 0,1,2,3,4,5
- * 0,1,2,3,4,5,6
- * 0,1,2,3,4,5,6,7
+ * Get next queue for service. Unless we have a queue preemption,
+ * we'll simply select the first cfqq in the service tree.
  */
-static int cfq_get_next_prio_level(struct cfq_data *cfqd)
+static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 {
-	int prio, wrap;
+	struct cfq_queue *cfqq;
+	struct rb_node *n;
 
-	prio = -1;
-	wrap = 0;
-	do {
-		int p;
+	if (RB_EMPTY_ROOT(&cfqd->service_tree.rb))
+		return NULL;
 
-		for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
-			if (!list_empty(&cfqd->rr_list[p])) {
-				prio = p;
-				break;
-			}
-		}
+	n = cfq_rb_first(&cfqd->service_tree);
+	cfqq = rb_entry(n, struct cfq_queue, rb_node);
 
-		if (prio != -1)
-			break;
-		cfqd->cur_prio = 0;
-		if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
-			cfqd->cur_end_prio = 0;
-			if (wrap)
-				break;
-			wrap = 1;
-		}
-	} while (1);
+	if (cfq_class_idle(cfqq)) {
+		unsigned long end;
 
-	if (unlikely(prio == -1))
-		return -1;
+		/*
+		 * if we have idle queues and no rt or be queues had
+		 * pending requests, either allow immediate service if
+		 * the grace period has passed or arm the idle grace
+		 * timer
+		 */
+		end = cfqd->last_end_request + CFQ_IDLE_GRACE;
+		if (time_before(jiffies, end)) {
+			mod_timer(&cfqd->idle_class_timer, end);
+			cfqq = NULL;
+		}
+	}
 
-	BUG_ON(prio >= CFQ_PRIO_LISTS);
+	return cfqq;
+}
 
-	list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
+/*
+ * Get and set a new active queue for service.
+ */
+static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
+{
+	struct cfq_queue *cfqq;
 
-	cfqd->cur_prio = prio + 1;
-	if (cfqd->cur_prio > cfqd->cur_end_prio) {
-		cfqd->cur_end_prio = cfqd->cur_prio;
-		cfqd->cur_prio = 0;
-	}
-	if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
-		cfqd->cur_prio = 0;
-		cfqd->cur_end_prio = 0;
-	}
+	cfqq = cfq_get_next_queue(cfqd);
+	__cfq_set_active_queue(cfqd, cfqq);
+	return cfqq;
+}
 
-	return prio;
+static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
+					  struct request *rq)
+{
+	if (rq->sector >= cfqd->last_position)
+		return rq->sector - cfqd->last_position;
+	else
+		return cfqd->last_position - rq->sector;
 }
 
-static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
+static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
 {
-	struct cfq_queue *cfqq = NULL;
+	struct cfq_io_context *cic = cfqd->active_cic;
 
-	if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) {
-		/*
-		 * if current list is non-empty, grab first entry. if it is
-		 * empty, get next prio level and grab first entry then if any
-		 * are spliced
-		 */
-		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
-	} else if (!list_empty(&cfqd->busy_rr)) {
-		/*
-		 * If no new queues are available, check if the busy list has
-		 * some before falling back to idle io.
-		 */
-		cfqq = list_entry_cfqq(cfqd->busy_rr.next);
-	} else if (!list_empty(&cfqd->idle_rr)) {
-		/*
-		 * if we have idle queues and no rt or be queues had pending
-		 * requests, either allow immediate service if the grace period
-		 * has passed or arm the idle grace timer
-		 */
-		unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
+	if (!sample_valid(cic->seek_samples))
+		return 0;
 
-		if (time_after_eq(jiffies, end))
-			cfqq = list_entry_cfqq(cfqd->idle_rr.next);
-		else
-			mod_timer(&cfqd->idle_class_timer, end);
-	}
+	return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
+}
 
-	__cfq_set_active_queue(cfqd, cfqq);
-	return cfqq;
+static int cfq_close_cooperator(struct cfq_data *cfq_data,
+				struct cfq_queue *cfqq)
+{
+	/*
+	 * We should notice if some of the queues are cooperating, eg
+	 * working closely on the same area of the disk. In that case,
+	 * we can group them together and don't waste time idling.
+	 */
+	return 0;
 }
 
-#define CIC_SEEKY(cic) ((cic)->seek_mean > (128 * 1024))
+#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
 
-static int cfq_arm_slice_timer(struct cfq_data *cfqd)
+static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq = cfqd->active_queue;
 	struct cfq_io_context *cic;
 	unsigned long sl;
 
 	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
+	WARN_ON(cfq_cfqq_slice_new(cfqq));
 
 	/*
 	 * idle is disabled, either manually or by past process history
 	 */
-	if (!cfqd->cfq_slice_idle)
-		return 0;
-	if (!cfq_cfqq_idle_window(cfqq))
-		return 0;
+	if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq))
+		return;
+
 	/*
 	 * task has exited, don't wait
 	 */
 	cic = cfqd->active_cic;
 	if (!cic || !cic->ioc->task)
-		return 0;
+		return;
+
+	/*
+	 * See if this prio level has a good candidate
+	 */
+	if (cfq_close_cooperator(cfqd, cfqq) &&
+	    (sample_valid(cic->ttime_samples) && cic->ttime_mean > 2))
+		return;
 
 	cfq_mark_cfqq_must_dispatch(cfqq);
 	cfq_mark_cfqq_wait_request(cfqq);
 
-	sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
-
 	/*
 	 * we don't want to idle for seeks, but we do want to allow
 	 * fair distribution of slice time for a process doing back-to-back
 	 * seeks. so allow a little bit of time for him to submit a new rq
 	 */
+	sl = cfqd->cfq_slice_idle;
 	if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
-		sl = min(sl, msecs_to_jiffies(2));
+		sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
 
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
-	return 1;
 }
 
+/*
+ * Move request from internal lists to the request queue dispatch list.
+ */
 static void cfq_dispatch_insert(request_queue_t *q, struct request *rq)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 
 	cfq_remove_request(rq);
-	cfqq->on_dispatch[rq_is_sync(rq)]++;
+	cfqq->dispatched++;
 	elv_dispatch_sort(q, rq);
 
-	rq = list_entry(q->queue_head.prev, struct request, queuelist);
-	cfqd->last_sector = rq->sector + rq->nr_sectors;
+	if (cfq_cfqq_sync(cfqq))
+		cfqd->sync_flight++;
 }
 
 /*
@@ -889,13 +934,13 @@ static inline struct request *cfq_check_
 	if (list_empty(&cfqq->fifo))
 		return NULL;
 
-	fifo = cfq_cfqq_class_sync(cfqq);
+	fifo = cfq_cfqq_sync(cfqq);
 	rq = rq_entry_fifo(cfqq->fifo.next);
 
-	if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo]))
-		return rq;
+	if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo]))
+		return NULL;
 
-	return NULL;
+	return rq;
 }
 
 static inline int
@@ -909,7 +954,8 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd,
 }
 
 /*
- * get next queue for service
+ * Select a queue for service. If we have a current active queue,
+ * check whether to continue servicing it, or retrieve and set a new one.
  */
 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 {
@@ -920,33 +966,41 @@ static struct cfq_queue *cfq_select_queu
 		goto new_queue;
 
 	/*
-	 * slice has expired
+	 * The active queue has run out of time, expire it and select new.
 	 */
-	if (!cfq_cfqq_must_dispatch(cfqq) && cfq_slice_used(cfqq))
+	if (cfq_slice_used(cfqq))
 		goto expire;
 
 	/*
-	 * if queue has requests, dispatch one. if not, check if
-	 * enough slice is left to wait for one
+	 * The active queue has requests and isn't expired, allow it to
+	 * dispatch.
 	 */
 	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
 		goto keep_queue;
-	else if (cfq_cfqq_slice_new(cfqq) || cfq_cfqq_dispatched(cfqq)) {
+
+	/*
+	 * No requests pending. If the active queue still has requests in
+	 * flight or is idling for a new request, allow either of these
+	 * conditions to happen (or time out) before selecting a new queue.
+	 */
+	if (timer_pending(&cfqd->idle_slice_timer) ||
+	    (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) {
 		cfqq = NULL;
 		goto keep_queue;
-	} else if (cfq_cfqq_class_sync(cfqq)) {
-		if (cfq_arm_slice_timer(cfqd))
-			return NULL;
 	}
 
 expire:
-	cfq_slice_expired(cfqd, 0, 0);
+	cfq_slice_expired(cfqd, 0);
 new_queue:
 	cfqq = cfq_set_active_queue(cfqd);
 keep_queue:
 	return cfqq;
 }
 
+/*
+ * Dispatch some requests from cfqq, moving them to the request queue
+ * dispatch list.
+ */
 static int
 __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			int max_dispatch)
@@ -969,7 +1023,6 @@ __cfq_dispatch_requests(struct cfq_data 
 		 */
 		cfq_dispatch_insert(cfqd->queue, rq);
 
-		cfqd->dispatch_slice++;
 		dispatched++;
 
 		if (!cfqd->active_cic) {
@@ -987,57 +1040,54 @@ __cfq_dispatch_requests(struct cfq_data 
 	 * queue always expire after 1 dispatch round.
 	 */
 	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
-	    cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
+	    dispatched >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
 	    cfq_class_idle(cfqq))) {
 		cfqq->slice_end = jiffies + 1;
-		cfq_slice_expired(cfqd, 0, 0);
+		cfq_slice_expired(cfqd, 0);
 	}
 
 	return dispatched;
 }
 
-static int
-cfq_forced_dispatch_cfqqs(struct list_head *list)
+static inline int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
 {
-	struct cfq_queue *cfqq, *next;
-	int dispatched;
+	int dispatched = 0;
 
-	dispatched = 0;
-	list_for_each_entry_safe(cfqq, next, list, cfq_list) {
-		while (cfqq->next_rq) {
-			cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
-			dispatched++;
-		}
-		BUG_ON(!list_empty(&cfqq->fifo));
+	while (cfqq->next_rq) {
+		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
+		dispatched++;
 	}
 
+	BUG_ON(!list_empty(&cfqq->fifo));
 	return dispatched;
 }
 
-static int
-cfq_forced_dispatch(struct cfq_data *cfqd)
+/*
+ * Drain our current requests. Used for barriers and when switching
+ * io schedulers on-the-fly.
+ */
+static int cfq_forced_dispatch(struct cfq_data *cfqd)
 {
-	int i, dispatched = 0;
+	int dispatched = 0;
+	struct rb_node *n;
 
-	for (i = 0; i < CFQ_PRIO_LISTS; i++)
-		dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]);
+	while ((n = cfq_rb_first(&cfqd->service_tree)) != NULL) {
+		struct cfq_queue *cfqq = rb_entry(n, struct cfq_queue, rb_node);
 
-	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr);
-	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
-	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);
+		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
+	}
 
-	cfq_slice_expired(cfqd, 0, 0);
+	cfq_slice_expired(cfqd, 0);
 
 	BUG_ON(cfqd->busy_queues);
 
 	return dispatched;
 }
 
-static int
-cfq_dispatch_requests(request_queue_t *q, int force)
+static int cfq_dispatch_requests(request_queue_t *q, int force)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_queue *cfqq, *prev_cfqq;
+	struct cfq_queue *cfqq;
 	int dispatched;
 
 	if (!cfqd->busy_queues)
@@ -1047,36 +1097,28 @@ cfq_dispatch_requests(request_queue_t *q
 		return cfq_forced_dispatch(cfqd);
 
 	dispatched = 0;
-	prev_cfqq = NULL;
 	while ((cfqq = cfq_select_queue(cfqd)) != NULL) {
 		int max_dispatch;
 
-		if (cfqd->busy_queues > 1) {
-			/*
-			 * Don't repeat dispatch from the previous queue.
-			 */
-			if (prev_cfqq == cfqq)
-				break;
+		max_dispatch = cfqd->cfq_quantum;
+		if (cfq_class_idle(cfqq))
+			max_dispatch = 1;
 
-			/*
-			 * So we have dispatched before in this round, if the
-			 * next queue has idling enabled (must be sync), don't
-			 * allow it service until the previous have continued.
-			 */
-			if (cfqd->rq_in_driver && cfq_cfqq_idle_window(cfqq))
+		if (cfqq->dispatched >= max_dispatch) {
+			if (cfqd->busy_queues > 1)
+				break;
+			if (cfqq->dispatched >= 4 * max_dispatch)
 				break;
 		}
 
+		if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
+			break;
+
 		cfq_clear_cfqq_must_dispatch(cfqq);
 		cfq_clear_cfqq_wait_request(cfqq);
 		del_timer(&cfqd->idle_slice_timer);
 
-		max_dispatch = cfqd->cfq_quantum;
-		if (cfq_class_idle(cfqq))
-			max_dispatch = 1;
-
 		dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
-		prev_cfqq = cfqq;
 	}
 
 	return dispatched;
@@ -1102,14 +1144,13 @@ static void cfq_put_queue(struct cfq_que
 	BUG_ON(cfq_cfqq_on_rr(cfqq));
 
 	if (unlikely(cfqd->active_queue == cfqq)) {
-		__cfq_slice_expired(cfqd, cfqq, 0, 0);
+		__cfq_slice_expired(cfqd, cfqq, 0);
 		cfq_schedule_dispatch(cfqd);
 	}
 
 	/*
 	 * it's on the empty list and still hashed
 	 */
-	list_del(&cfqq->cfq_list);
 	hlist_del(&cfqq->cfq_hash);
 	kmem_cache_free(cfq_pool, cfqq);
 }
@@ -1160,7 +1201,7 @@ static void cfq_free_io_context(struct i
 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	if (unlikely(cfqq == cfqd->active_queue)) {
-		__cfq_slice_expired(cfqd, cfqq, 0, 0);
+		__cfq_slice_expired(cfqd, cfqq, 0);
 		cfq_schedule_dispatch(cfqd);
 	}
 
@@ -1185,10 +1226,6 @@ static void __cfq_exit_single_io_context
 	}
 }
 
-
-/*
- * Called with interrupts disabled
- */
 static void cfq_exit_single_io_context(struct cfq_io_context *cic)
 {
 	struct cfq_data *cfqd = cic->key;
@@ -1202,6 +1239,10 @@ static void cfq_exit_single_io_context(s
 	}
 }
 
+/*
+ * The process that ioc belongs to has exited, we need to clean up
+ * and put the internal structures we have that belongs to that process.
+ */
 static void cfq_exit_io_context(struct io_context *ioc)
 {
 	struct cfq_io_context *__cic;
@@ -1278,8 +1319,6 @@ static void cfq_init_prio_data(struct cf
 	 */
 	cfqq->org_ioprio = cfqq->ioprio;
 	cfqq->org_ioprio_class = cfqq->ioprio_class;
-
-	cfq_resort_rr_list(cfqq, 0);
 	cfq_clear_cfqq_prio_changed(cfqq);
 }
 
@@ -1364,7 +1403,7 @@ retry:
 		memset(cfqq, 0, sizeof(*cfqq));
 
 		INIT_HLIST_NODE(&cfqq->cfq_hash);
-		INIT_LIST_HEAD(&cfqq->cfq_list);
+		RB_CLEAR_NODE(&cfqq->rb_node);
 		INIT_LIST_HEAD(&cfqq->fifo);
 
 		cfqq->key = key;
@@ -1389,6 +1428,9 @@ out:
 	return cfqq;
 }
 
+/*
+ * We drop cfq io contexts lazily, so we may find a dead one.
+ */
 static void
 cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic)
 {
@@ -1518,7 +1560,8 @@ cfq_update_io_thinktime(struct cfq_data 
 }
 
 static void
-cfq_update_io_seektime(struct cfq_io_context *cic, struct request *rq)
+cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
+		       struct request *rq)
 {
 	sector_t sdist;
 	u64 total;
@@ -1528,6 +1571,11 @@ cfq_update_io_seektime(struct cfq_io_con
 	else
 		sdist = cic->last_request_pos - rq->sector;
 
+	if (!cic->seek_samples) {
+		cfqd->new_seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
+		cfqd->new_seek_mean = cfqd->new_seek_total / 256;
+	}
+
 	/*
 	 * Don't allow the seek distance to get too large from the
 	 * odd fragment, pagein, etc
@@ -1552,7 +1600,12 @@ static void
 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		       struct cfq_io_context *cic)
 {
-	int enable_idle = cfq_cfqq_idle_window(cfqq);
+	int enable_idle;
+
+	if (!cfq_cfqq_sync(cfqq))
+		return;
+
+	enable_idle = cfq_cfqq_idle_window(cfqq);
 
 	if (!cic->ioc->task || !cfqd->cfq_slice_idle ||
 	    (cfqd->hw_tag && CIC_SEEKY(cic)))
@@ -1578,24 +1631,28 @@ static int
 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 		   struct request *rq)
 {
-	struct cfq_queue *cfqq = cfqd->active_queue;
+	struct cfq_queue *cfqq;
 
-	if (cfq_class_idle(new_cfqq))
+	cfqq = cfqd->active_queue;
+	if (!cfqq)
 		return 0;
 
-	if (!cfqq)
+	if (cfq_slice_used(cfqq))
+		return 1;
+
+	if (cfq_class_idle(new_cfqq))
 		return 0;
 
 	if (cfq_class_idle(cfqq))
 		return 1;
-	if (!cfq_cfqq_wait_request(new_cfqq))
-		return 0;
+
 	/*
 	 * if the new request is sync, but the currently running queue is
 	 * not, let the sync request have priority.
 	 */
 	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
 		return 1;
+
 	/*
 	 * So both queues are sync. Let the new request get disk time if
 	 * it's a metadata request and the current queue is doing regular IO.
@@ -1603,6 +1660,16 @@ cfq_should_preempt(struct cfq_data *cfqd
 	if (rq_is_meta(rq) && !cfqq->meta_pending)
 		return 1;
 
+	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
+		return 0;
+
+	/*
+	 * if this request is as-good as one we would expect from the
+	 * current cfqq, let it preempt
+	 */
+	if (cfq_rq_close(cfqd, rq))
+		return 1;
+
 	return 0;
 }
 
@@ -1612,14 +1679,15 @@ cfq_should_preempt(struct cfq_data *cfqd
  */
 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
-	cfq_slice_expired(cfqd, 1, 1);
+	cfq_slice_expired(cfqd, 1);
 
 	/*
 	 * Put the new queue at the front of the of the current list,
 	 * so we know that it will be selected next.
 	 */
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
-	list_move(&cfqq->cfq_list, &cfqd->cur_rr);
+
+	cfq_service_tree_add(cfqd, cfqq, 1);
 
 	cfqq->slice_end = 0;
 	cfq_mark_cfqq_slice_new(cfqq);
@@ -1644,28 +1712,12 @@ cfq_rq_enqueued(struct cfq_data *cfqd, s
 	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
 	BUG_ON(!cfqq->next_rq);
 
-	/*
-	 * we never wait for an async request and we don't allow preemption
-	 * of an async request. so just return early
-	 */
-	if (!rq_is_sync(rq)) {
-		/*
-		 * sync process issued an async request, if it's waiting
-		 * then expire it and kick rq handling.
-		 */
-		if (cic == cfqd->active_cic &&
-		    del_timer(&cfqd->idle_slice_timer)) {
-			cfq_slice_expired(cfqd, 0, 0);
-			blk_start_queueing(cfqd->queue);
-		}
-		return;
-	}
-
 	cfq_update_io_thinktime(cfqd, cic);
-	cfq_update_io_seektime(cic, rq);
+	cfq_update_io_seektime(cfqd, cic, rq);
 	cfq_update_idle_window(cfqd, cfqq, cic);
 
 	cic->last_request_pos = rq->sector + rq->nr_sectors;
+	cfqq->last_request_pos = cic->last_request_pos;
 
 	if (cfqq == cfqd->active_queue) {
 		/*
@@ -1714,16 +1766,16 @@ static void cfq_completed_request(reques
 	now = jiffies;
 
 	WARN_ON(!cfqd->rq_in_driver);
-	WARN_ON(!cfqq->on_dispatch[sync]);
+	WARN_ON(!cfqq->dispatched);
 	cfqd->rq_in_driver--;
-	cfqq->on_dispatch[sync]--;
-	cfqq->service_last = now;
+	cfqq->dispatched--;
+
+	if (cfq_cfqq_sync(cfqq))
+		cfqd->sync_flight--;
 
 	if (!cfq_class_idle(cfqq))
 		cfqd->last_end_request = now;
 
-	cfq_resort_rr_list(cfqq, 0);
-
 	if (sync)
 		RQ_CIC(rq)->last_end_request = now;
 
@@ -1737,12 +1789,13 @@ static void cfq_completed_request(reques
 			cfq_clear_cfqq_slice_new(cfqq);
 		}
 		if (cfq_slice_used(cfqq))
-			cfq_slice_expired(cfqd, 0, 1);
-		else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) {
-			if (!cfq_arm_slice_timer(cfqd))
-				cfq_schedule_dispatch(cfqd);
-		}
+			cfq_slice_expired(cfqd, 1);
+		else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list))
+			cfq_arm_slice_timer(cfqd);
 	}
+
+	if (!cfqd->rq_in_driver)
+		cfq_schedule_dispatch(cfqd);
 }
 
 /*
@@ -1751,9 +1804,6 @@ static void cfq_completed_request(reques
  */
 static void cfq_prio_boost(struct cfq_queue *cfqq)
 {
-	const int ioprio_class = cfqq->ioprio_class;
-	const int ioprio = cfqq->ioprio;
-
 	if (has_fs_excl()) {
 		/*
 		 * boost idle prio on transactions that would lock out other
@@ -1772,12 +1822,6 @@ static void cfq_prio_boost(struct cfq_qu
 		if (cfqq->ioprio != cfqq->org_ioprio)
 			cfqq->ioprio = cfqq->org_ioprio;
 	}
-
-	/*
-	 * refile between round-robin lists if we moved the priority class
-	 */
-	if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio))
-		cfq_resort_rr_list(cfqq, 0);
 }
 
 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
@@ -1940,7 +1984,7 @@ static void cfq_idle_slice_timer(unsigne
 		}
 	}
 expire:
-	cfq_slice_expired(cfqd, 0, timed_out);
+	cfq_slice_expired(cfqd, timed_out);
 out_kick:
 	cfq_schedule_dispatch(cfqd);
 out_cont:
@@ -1986,7 +2030,7 @@ static void cfq_exit_queue(elevator_t *e
 	spin_lock_irq(q->queue_lock);
 
 	if (cfqd->active_queue)
-		__cfq_slice_expired(cfqd, cfqd->active_queue, 0, 0);
+		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
 
 	while (!list_empty(&cfqd->cic_list)) {
 		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
@@ -2015,12 +2059,7 @@ static void *cfq_init_queue(request_queu
 
 	memset(cfqd, 0, sizeof(*cfqd));
 
-	for (i = 0; i < CFQ_PRIO_LISTS; i++)
-		INIT_LIST_HEAD(&cfqd->rr_list[i]);
-
-	INIT_LIST_HEAD(&cfqd->busy_rr);
-	INIT_LIST_HEAD(&cfqd->cur_rr);
-	INIT_LIST_HEAD(&cfqd->idle_rr);
+	cfqd->service_tree = CFQ_RB_ROOT;
 	INIT_LIST_HEAD(&cfqd->cic_list);
 
 	cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node);
@@ -2087,7 +2126,6 @@ fail:
 /*
  * sysfs parts below -->
  */
-
 static ssize_t
 cfq_var_show(unsigned int var, char *page)
 {