GIT ca3200d2d2ac4589f77764dff841b5ae1c3b1495 master.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git#cfq

commit ca3200d2d2ac4589f77764dff841b5ae1c3b1495
Author: Jens Axboe
Date:   Sat Nov 26 15:42:12 2005 +0100

    [BLOCK] cfq-iosched: seek and async performance fixes

    Detect whether a given process is seeky and, if so, mostly disable the
    idle window for it. We still allow a little idle time, just enough for
    that process to submit a new request; that is needed to maintain
    fairness across priority groups.

    In some cases we could set up several async queues. This is not optimal
    from a performance POV, since we want all async io in one queue so we
    can do good sorting on it. It also impacted sync queues, as async io
    got too much slice time.

    Signed-off-by: Jens Axboe

commit 48a4db6338de7493686117858f06600e10b6f90e
Author: Jens Axboe
Date:   Thu Nov 17 14:47:45 2005 +0100

    [BLOCK] Fix oops on io scheduler unload

    We only checked for 'ioc' existence in the 'as' path, not in 'cfq'.
    Move on if ->io_context isn't set.

    Spotted by Dirk Henning Gerdes

    Signed-off-by: Jens Axboe

commit 4e9d7d5f8457c01ef345f27a549e7e821c0feda0
Author: Jens Axboe
Date:   Tue Nov 15 14:37:12 2005 +0100

    [BLOCK] cfq-iosched: Fix oops with only idle queues

    Even if we entered cfq_get_next_cfqq() with cfqd->busy_queues != 0, we
    may still end up with an empty cur_rr at the end if there are only
    idle queues in the system. So check for that.

commit c5156c005310026001c7ff56891cfcc533a1e7a5
Author: Jens Axboe
Date:   Tue Nov 15 14:02:45 2005 +0100

    [BLOCK] cfq-iosched: change from escalator to staircase type service

    Currently, the priority RR algorithm in CFQ behaves like a see-saw,
    where the swing extends one extra prio level per iteration until they
    are all covered (then it starts over). This works fine for bandwidth
    distribution, but not so well for latencies.

    Writing a test model for this algorithm gives the following computed
    latencies for one process running at each of the 8 priority levels:

        prio0: 30.01% disk time,  700msec max latency (tested  710msec)
        prio1: 23.34% disk time,  900msec max latency (tested  728msec)
        prio2: 17.50% disk time, 1260msec max latency (tested 1084msec)
        prio3: 12.50% disk time, 1760msec max latency (tested 1581msec)
        prio4:  8.33% disk time, 2380msec max latency (tested 2228msec)
        prio5:  5.00% disk time, 3100msec max latency (tested 2922msec)
        prio6:  2.50% disk time, 3900msec max latency (tested 3730msec)
        prio7:  0.83% disk time, 4760msec max latency (tested 4588msec)

    'tested' shows the actual latencies measured with 'fio'; reality
    matches the theory. So far, so good. If we simulate 3 processes at
    each prio level, though, the max latency for prio0 rises to 2460msec,
    and prio4 (the default for a process) rises to 7340msec!

    A more generic staircase model, where we climb down the stairs
    (priority levels) and let the dynamic priority of a process increase
    until it hits the top and then slides back to its original prio,
    could be more interesting from a latency POV.
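As an illustration of the staircase rotation described above, here is a
minimal standalone userspace sketch. It is not part of the patch; it simply
mirrors the cfq_prio_inc() wrap rule and the cfq_prio_to_slice() scaling from
the diff that follows, with an arbitrary base_slice value.

    /* Standalone model of the staircase priority rotation -- illustrative only. */
    #include <stdio.h>

    #define CFQ_PRIO_LISTS	8
    #define CFQ_SLICE_SCALE	5

    static const int base_slice = 100;	/* arbitrary time units */

    /* same wrap rule as cfq_prio_inc(): climb until we fall off, then snap back */
    static void prio_inc(unsigned short *p, unsigned short low_p)
    {
    	if (++(*p) == CFQ_PRIO_LISTS)
    		*p = low_p;
    }

    /* same scaling as cfq_prio_to_slice(): prio 4 gets exactly the base slice */
    static int prio_to_slice(unsigned short prio)
    {
    	return base_slice + base_slice / CFQ_SLICE_SCALE * (4 - prio);
    }

    int main(void)
    {
    	unsigned short base = 2, dyn = 2;	/* one queue, base ioprio 2 */

    	for (int round = 0; round < 12; round++) {
    		printf("round %2d: dyn prio %u, slice %d\n",
    		       round, dyn, prio_to_slice(dyn));
    		prio_inc(&dyn, base);	/* what __cfq_slice_expired() does per slice */
    	}
    	return 0;
    }

Running it shows the queue's slice shrinking as its dynamic priority walks
down the levels and snapping back to the base priority once it falls off the
bottom, which is the behaviour the simulation numbers below measure.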
    Simulating that shows (for 1 process at each level):

        prio0: 27.85% disk time,  400msec max latency (tested  405msec)
        prio1: 22.15% disk time,  420msec max latency (tested  429msec)
        prio2: 17.09% disk time,  620msec max latency (tested  662msec)
        prio3: 12.66% disk time, 1080msec max latency (tested 1126msec)
        prio4:  8.86% disk time, 1600msec max latency (tested 1641msec)
        prio5:  5.70% disk time, 2140msec max latency (tested 2182msec)
        prio6:  3.16% disk time, 2660msec max latency (tested 2669msec)
        prio7:  2.53% disk time, 2800msec max latency (tested 2803msec)

    Latency is almost halved, while the aggregate and individually
    measured throughput stays the same. Service distribution differs a
    little from the old algorithm, but not by much. For 3 processes at
    each level, prio0 has a max latency of 1440msec and prio4 of 6120msec.

    As a bonus, we drop one list from cfqd and make the code a lot more
    readable.

    Signed-off-by: Jens Axboe

commit 7d617fa044d925ec26641dc8e8fa81d6643c1f33
Author: Jens Axboe
Date:   Tue Nov 15 14:01:40 2005 +0100

    [BLOCK] cfq-iosched: change cfq io context linking from list to tree

    On setups with many disks, we spend a considerable amount of time
    looking up the process-disk mapping on each queued io. Testing with a
    null-based block driver, this lookup costs a 40-50% reduction in
    throughput with 1000 disks.

    Signed-off-by: Jens Axboe

---
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ee0bb41..219d122 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -34,13 +34,12 @@ static int cfq_back_penalty = 2;	/* pena
 static int cfq_slice_sync = HZ / 10;
 static int cfq_slice_async = HZ / 25;
 static int cfq_slice_async_rq = 2;
-static int cfq_slice_idle = HZ / 100;
+static int cfq_slice_idle = HZ / 70;
 
 #define CFQ_IDLE_GRACE		(HZ / 10)
 #define CFQ_SLICE_SCALE		(5)
 
 #define CFQ_KEY_ASYNC		(0)
-#define CFQ_KEY_ANY		(0xffff)
 
 /*
  * disable queueing at the driver/hardware level
@@ -105,6 +104,8 @@ static kmem_cache_t *cfq_ioc_pool;
 #define cfq_cfqq_sync(cfqq)	\
 	(cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
 
+#define sample_valid(samples)	((samples) > 80)
+
 /*
  * Per block device queue structure
  */
@@ -116,8 +117,9 @@ struct cfq_data {
 	 * rr list of queues with requests and the count of them
 	 */
 	struct list_head rr_list[CFQ_PRIO_LISTS];
-	struct list_head busy_rr;
 	struct list_head cur_rr;
+	unsigned short cur_prio;
+
 	struct list_head idle_rr;
 
 	unsigned int busy_queues;
 
@@ -153,7 +155,6 @@ struct cfq_data {
 	struct cfq_queue *active_queue;
 	struct cfq_io_context *active_cic;
-	int cur_prio, cur_end_prio;
 	unsigned int dispatch_slice;
 
 	struct timer_list idle_class_timer;
@@ -211,8 +212,13 @@ struct cfq_queue {
 	int on_dispatch[2];
 
 	/* io prio of this group */
-	unsigned short ioprio, org_ioprio;
-	unsigned short ioprio_class, org_ioprio_class;
+	unsigned short ioprio_class, ioprio;
+
+	/* current dynamic stair priority */
+	unsigned short dyn_ioprio;
+
+	/* same as real ioprio, except if queue has been elevated */
+	unsigned short org_ioprio_class, org_ioprio;
 
 	/* various state flags, see below */
 	unsigned int flags;
@@ -347,6 +353,14 @@ static int cfq_queue_empty(request_queue
 	return !cfqd->busy_queues;
 }
 
+static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
+{
+	if (rw == READ || process_sync(task))
+		return task->pid;
+
+	return CFQ_KEY_ASYNC;
+}
+
 /*
  * Lifted from AS - choose which of crq1 and crq2 that is best served now.
  * We choose the request that is closest to the head right now. Distance
@@ -471,25 +485,13 @@ static void cfq_resort_rr_list(struct cf
 		list = &cfqd->cur_rr;
 	else if (cfq_class_idle(cfqq))
 		list = &cfqd->idle_rr;
-	else {
-		/*
-		 * if cfqq has requests in flight, don't allow it to be
-		 * found in cfq_set_active_queue before it has finished them.
-		 * this is done to increase fairness between a process that
-		 * has lots of io pending vs one that only generates one
-		 * sporadically or synchronously
-		 */
-		if (cfq_cfqq_dispatched(cfqq))
-			list = &cfqd->busy_rr;
-		else
-			list = &cfqd->rr_list[cfqq->ioprio];
-	}
+	else
+		list = &cfqd->rr_list[cfqq->dyn_ioprio];
 
 	/*
-	 * if queue was preempted, just add to front to be fair. busy_rr
-	 * isn't sorted.
+	 * if queue was preempted, just add to front to be fair.
 	 */
-	if (preempted || list == &cfqd->busy_rr) {
+	if (preempted) {
 		list_add(&cfqq->cfq_list, list);
 		return;
 	}
@@ -501,6 +503,8 @@
 	while ((entry = entry->prev) != list) {
 		struct cfq_queue *__cfqq = list_entry_cfqq(entry);
 
+		if (__cfqq->ioprio < cfqq->ioprio)
+			break;
 		if (!__cfqq->service_last)
 			break;
 		if (time_before(__cfqq->service_last, cfqq->service_last))
@@ -616,15 +620,20 @@ cfq_reposition_crq_rb(struct cfq_queue *
 	cfq_add_crq_rb(crq);
 }
 
-static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
-
+static struct request *
+cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
 {
-	struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid, CFQ_KEY_ANY);
+	struct task_struct *tsk = current;
+	pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio));
+	struct cfq_queue *cfqq;
 	struct rb_node *n;
+	sector_t sector;
 
+	cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
 	if (!cfqq)
 		goto out;
 
+	sector = bio->bi_sector + bio_sectors(bio);
 	n = cfqq->sort_list.rb_node;
 	while (n) {
 		struct cfq_rq *crq = rb_entry_crq(n);
@@ -678,7 +687,7 @@ cfq_merge(request_queue_t *q, struct req
 		goto out;
 	}
 
-	__rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
+	__rq = cfq_find_rq_fmerge(cfqd, bio);
 	if (__rq && elv_rq_merge_ok(__rq, bio)) {
 		ret = ELEVATOR_FRONT_MERGE;
 		goto out;
@@ -722,81 +731,100 @@ cfq_merged_requests(request_queue_t *q,
 	cfq_remove_request(next);
 }
 
+/*
+ * Scale schedule slice based on io priority. Use the sync time slice only
+ * if a queue is marked sync and has sync io queued. A sync queue with async
+ * io only, should not get full sync slice length.
+ */
+static inline int
+cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
+	unsigned short prio = cfqq->dyn_ioprio;
+
+	WARN_ON(prio >= IOPRIO_BE_NR);
+
+	if (cfq_class_rt(cfqq))
+		prio = 0;
+
+	return base_slice + (base_slice / CFQ_SLICE_SCALE * (4 - prio));
+}
+
 static inline void
-__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
-	if (cfqq) {
-		/*
-		 * stop potential idle class queues waiting service
-		 */
-		del_timer(&cfqd->idle_class_timer);
+	cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
+}
 
-		cfqq->slice_start = jiffies;
-		cfqq->slice_end = 0;
-		cfqq->slice_left = 0;
-		cfq_clear_cfqq_must_alloc_slice(cfqq);
-		cfq_clear_cfqq_fifo_expire(cfqq);
-		cfq_clear_cfqq_expired(cfqq);
-	}
+static inline int
+cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	const int base_rq = cfqd->cfq_slice_async_rq;
+	unsigned short prio = cfqq->dyn_ioprio;
 
-	cfqd->active_queue = cfqq;
+	WARN_ON(cfqq->dyn_ioprio >= IOPRIO_BE_NR);
+
+	if (cfq_class_rt(cfqq))
+		prio = 0;
+
+	return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - prio));
 }
 
-/*
- * 0
- * 0,1
- * 0,1,2
- * 0,1,2,3
- * 0,1,2,3,4
- * 0,1,2,3,4,5
- * 0,1,2,3,4,5,6
- * 0,1,2,3,4,5,6,7
- */
-static int cfq_get_next_prio_level(struct cfq_data *cfqd)
+static inline void cfq_prio_inc(unsigned short *p, unsigned int low_p)
 {
-	int prio, wrap;
+	if (++(*p) == CFQ_PRIO_LISTS)
+		*p = low_p;
+}
 
-	prio = -1;
-	wrap = 0;
-	do {
-		int p;
+static struct cfq_queue *cfq_get_next_cfqq(struct cfq_data *cfqd)
+{
+	if (!cfqd->busy_queues)
+		return NULL;
 
-		for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
-			if (!list_empty(&cfqd->rr_list[p])) {
-				prio = p;
+	if (list_empty(&cfqd->cur_rr)) {
+		unsigned short prio = cfqd->cur_prio;
+
+		do {
+			struct list_head *list = &cfqd->rr_list[prio];
+
+			if (!list_empty(list)) {
+				list_splice_init(list, &cfqd->cur_rr);
 				break;
 			}
-		}
-		if (prio != -1)
-			break;
-		cfqd->cur_prio = 0;
-		if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
-			cfqd->cur_end_prio = 0;
-			if (wrap)
-				break;
-			wrap = 1;
-		}
-	} while (1);
+
+			cfq_prio_inc(&prio, 0);
 
-	if (unlikely(prio == -1))
-		return -1;
+		} while (prio != cfqd->cur_prio);
 
-	BUG_ON(prio >= CFQ_PRIO_LISTS);
+		cfq_prio_inc(&cfqd->cur_prio, 0);
+	}
 
-	list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
+	if (!list_empty(&cfqd->cur_rr))
+		return list_entry_cfqq(cfqd->cur_rr.next);
 
-	cfqd->cur_prio = prio + 1;
-	if (cfqd->cur_prio > cfqd->cur_end_prio) {
-		cfqd->cur_end_prio = cfqd->cur_prio;
-		cfqd->cur_prio = 0;
-	}
-	if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
-		cfqd->cur_prio = 0;
-		cfqd->cur_end_prio = 0;
+	return NULL;
+}
+
+static inline void
+__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	if (cfqq) {
+		WARN_ON(RB_EMPTY(&cfqq->sort_list));
+
+		/*
+		 * stop potential idle class queues waiting service
+		 */
+		del_timer(&cfqd->idle_class_timer);
+
+		cfqq->slice_start = jiffies;
+		cfqq->slice_end = 0;
+		cfqq->slice_left = 0;
+		cfq_clear_cfqq_must_alloc_slice(cfqq);
+		cfq_clear_cfqq_fifo_expire(cfqq);
+		cfq_clear_cfqq_expired(cfqq);
 	}
-	return prio;
+	cfqd->active_queue = cfqq;
 }
 
 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
@@ -809,15 +837,10 @@ static struct cfq_queue *cfq_set_active_
 	 */
 	if ((cfqq = cfqd->active_queue) != NULL) {
 		if (cfq_cfqq_expired(cfqq) && cfq_cfqq_dispatched(cfqq))
-			return NULL;
+			return cfqq;
 	}
 
-	/*
-	 * if current list is non-empty, grab first entry. if it is empty,
-	 * get next prio level and grab first entry then if any are spliced
-	 */
-	if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1)
-		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
+	cfqq = cfq_get_next_cfqq(cfqd);
 
 	/*
 	 * if we have idle queues and no rt or be queues had pending
@@ -842,7 +865,7 @@
 	 */
 static void
 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-		    int preempted)
+		    int preempted, int force)
 {
 	unsigned long now = jiffies;
@@ -864,39 +887,44 @@ __cfq_slice_expired(struct cfq
 	else
 		cfqq->slice_left = 0;
 
+	cfq_prio_inc(&cfqq->dyn_ioprio, cfqq->ioprio);
+
 	if (cfq_cfqq_on_rr(cfqq))
 		cfq_resort_rr_list(cfqq, preempted);
 
-	if (cfqq == cfqd->active_queue)
-		cfqd->active_queue = NULL;
+	/*
+	 * use deferred expiry, if there are requests in progress as
+	 * not to disturb the slice of the next queue
+	 */
+	if (cfq_cfqq_dispatched(cfqq) && !force)
+		cfq_mark_cfqq_expired(cfqq);
+	else {
+		if (cfqq == cfqd->active_queue)
+			cfqd->active_queue = NULL;
+
+		if (cfqd->active_cic) {
+			put_io_context(cfqd->active_cic->ioc);
+			cfqd->active_cic = NULL;
+		}
 
-	if (cfqd->active_cic) {
-		put_io_context(cfqd->active_cic->ioc);
-		cfqd->active_cic = NULL;
+		cfqd->dispatch_slice = 0;
 	}
-
-	cfqd->dispatch_slice = 0;
 }
 
 static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted)
 {
 	struct cfq_queue *cfqq = cfqd->active_queue;
 
-	if (cfqq) {
-		/*
-		 * use deferred expiry, if there are requests in progress as
-		 * not to disturb the slice of the next queue
-		 */
-		if (cfq_cfqq_dispatched(cfqq))
-			cfq_mark_cfqq_expired(cfqq);
-		else
-			__cfq_slice_expired(cfqd, cfqq, preempted);
-	}
+	if (cfqq)
+		__cfq_slice_expired(cfqd, cfqq, preempted, 0);
 }
 
 static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
+	struct cfq_io_context *cic;
+	unsigned long sl;
+
 	WARN_ON(!RB_EMPTY(&cfqq->sort_list));
 	WARN_ON(cfqq != cfqd->active_queue);
@@ -910,19 +938,31 @@
 	/*
 	 * task has exited, don't wait
 	 */
-	if (cfqd->active_cic && !cfqd->active_cic->ioc->task)
+	cic = cfqd->active_cic;
+	if (cic && !cic->ioc->task)
 		return 0;
 
+	/*
+	 * If timer is already running, continue waiting. If not, mark
+	 * us as waiting for a request and arm the idle timer
+	 */
+	if (timer_pending(&cfqd->idle_slice_timer))
+		return 1;
+
 	cfq_mark_cfqq_must_dispatch(cfqq);
 	cfq_mark_cfqq_wait_request(cfqq);
 
-	if (!timer_pending(&cfqd->idle_slice_timer)) {
-		unsigned long slice_left = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
-
-		cfqd->idle_slice_timer.expires = jiffies + slice_left;
-		add_timer(&cfqd->idle_slice_timer);
-	}
+	sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
+	/*
+	 * we don't want to idle for seeks, but we do want to allow
+	 * fair distribution of slice time for a process doing back-to-back
+	 * seeks. so allow a little bit of time for him to submit a new rq
+	 */
+	if (sample_valid(cic->seek_samples) && cic->seek_mean > 131072)
+		sl = 2;
+	cfqd->idle_slice_timer.expires = jiffies + sl;
+	add_timer(&cfqd->idle_slice_timer);
 
 	return 1;
 }
@@ -964,37 +1004,6 @@ static inline struct cfq_rq *cfq_check_f
 }
 
 /*
- * Scale schedule slice based on io priority. Use the sync time slice only
- * if a queue is marked sync and has sync io queued. A sync queue with async
- * io only, should not get full sync slice length.
- */
-static inline int
-cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
-
-	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
-
-	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
-}
-
-static inline void
-cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
-}
-
-static inline int
-cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	const int base_rq = cfqd->cfq_slice_async_rq;
-
-	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
-
-	return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
-}
-
-/*
  * get next queue for service
  */
 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
@@ -1007,7 +1016,7 @@ static struct cfq_queue *cfq_select_queu
 		goto new_queue;
 
 	if (cfq_cfqq_expired(cfqq))
-		goto new_queue;
+		goto keep_queue;
 
 	/*
 	 * slice has expired
@@ -1114,7 +1123,6 @@ cfq_forced_dispatch(struct cfq_data *cfq
 	for (i = 0; i < CFQ_PRIO_LISTS; i++)
 		dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]);
 
-	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr);
 	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
 	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);
 
@@ -1141,6 +1149,9 @@ cfq_dispatch_requests(request_queue_t *q
 	if (cfqq) {
 		int max_dispatch;
 
+		if (cfq_cfqq_expired(cfqq))
+			return 0;
+
 		/*
 		 * if idle window is disabled, allow queue buildup
 		 */
@@ -1182,18 +1193,18 @@ static void cfq_put_queue(struct cfq_que
 	BUG_ON(cfq_cfqq_on_rr(cfqq));
 
 	if (unlikely(cfqd->active_queue == cfqq)) {
-		__cfq_slice_expired(cfqd, cfqq, 0);
+		__cfq_slice_expired(cfqd, cfqq, 0, 1);
 		cfq_schedule_dispatch(cfqd);
 	}
 
-	cfq_put_cfqd(cfqq->cfqd);
-
 	/*
 	 * it's on the empty list and still hashed
 	 */
 	list_del(&cfqq->cfq_list);
 	hlist_del(&cfqq->cfq_hash);
 	kmem_cache_free(cfq_pool, cfqq);
+
+	cfq_put_cfqd(cfqd);
 }
 
 static inline struct cfq_queue *
@@ -1201,13 +1212,13 @@
 		    const int hashval)
 {
 	struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
-	struct hlist_node *entry, *next;
+	struct hlist_node *entry;
+	struct cfq_queue *__cfqq;
 
-	hlist_for_each_safe(entry, next, hash_list) {
-		struct cfq_queue *__cfqq = list_entry_qhash(entry);
+	hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) {
 		const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->ioprio_class, __cfqq->ioprio);
 
-		if (__cfqq->key == key && (__p == prio || prio == CFQ_KEY_ANY))
+		if (__cfqq->key == key && (__p == prio || !prio))
 			return __cfqq;
 	}
 
@@ -1220,17 +1231,17 @@ cfq_find_cfq_hash(struct cfq_data *cfqd,
 	return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
 }
 
-static void cfq_free_io_context(struct cfq_io_context *cic)
+static void cfq_free_io_context(struct io_context *ioc)
 {
 	struct cfq_io_context *__cic;
-	struct list_head *entry, *next;
+	struct rb_node *n;
+
+	while ((n = rb_first(&ioc->cic_root)) != NULL) {
+		__cic = rb_entry(n, struct cfq_io_context, rb_node);
 
-	list_for_each_safe(entry, next, &cic->list) {
-		__cic = list_entry(entry, struct cfq_io_context, list);
+		rb_erase(&__cic->rb_node, &ioc->cic_root);
 		kmem_cache_free(cfq_ioc_pool, __cic);
 	}
-
-	kmem_cache_free(cfq_ioc_pool, cic);
 }
 
 /*
@@ -1246,7 +1257,7 @@ static void cfq_exit_single_io_context(s
 	spin_lock(q->queue_lock);
 
 	if (unlikely(cic->cfqq == cfqd->active_queue)) {
-		__cfq_slice_expired(cfqd, cic->cfqq, 0);
+		__cfq_slice_expired(cfqd, cic->cfqq, 0, 1);
 		cfq_schedule_dispatch(cfqd);
 	}
 
@@ -1255,27 +1266,25 @@ static void cfq_exit_single_io_context(s
 	spin_unlock(q->queue_lock);
 }
 
-/*
- * Another task may update the task cic list, if it is doing a queue lookup
- * on its behalf. cfq_cic_lock excludes such concurrent updates
- */
-static void cfq_exit_io_context(struct cfq_io_context *cic)
+static void cfq_exit_io_context(struct io_context *ioc)
 {
 	struct cfq_io_context *__cic;
-	struct list_head *entry;
 	unsigned long flags;
+	struct rb_node *n;
 
 	local_irq_save(flags);
 
 	/*
 	 * put the reference this task is holding to the various queues
 	 */
-	list_for_each(entry, &cic->list) {
-		__cic = list_entry(entry, struct cfq_io_context, list);
+	n = rb_first(&ioc->cic_root);
+	while (n != NULL) {
+		__cic = rb_entry(n, struct cfq_io_context, rb_node);
+
+		cfq_exit_single_io_context(__cic);
+		n = rb_next(n);
 	}
 
-	cfq_exit_single_io_context(cic);
 	local_irq_restore(flags);
 }
 
@@ -1285,9 +1294,9 @@ cfq_alloc_io_context(struct cfq_data *cf
 	struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask);
 
 	if (cic) {
-		INIT_LIST_HEAD(&cic->list);
-		cic->cfqq = NULL;
+		RB_CLEAR(&cic->rb_node);
 		cic->key = NULL;
+		cic->cfqq = NULL;
 		cic->last_end_request = jiffies;
 		cic->ttime_total = 0;
 		cic->ttime_samples = 0;
@@ -1340,6 +1349,11 @@ static void cfq_init_prio_data(struct cf
 	cfqq->org_ioprio = cfqq->ioprio;
 	cfqq->org_ioprio_class = cfqq->ioprio_class;
 
+	/*
+	 * start priority
+	 */
+	cfqq->dyn_ioprio = cfqq->ioprio;
+
 	if (cfq_cfqq_on_rr(cfqq))
 		cfq_resort_rr_list(cfqq, 0);
 
@@ -1363,12 +1377,16 @@ static inline void changed_ioprio(struct
 */
 static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
 {
-	struct cfq_io_context *cic = ioc->cic;
+	struct cfq_io_context *cic;
+	struct rb_node *n;
 
-	changed_ioprio(cic->cfqq);
+	n = rb_first(&ioc->cic_root);
+	while (n != NULL) {
+		cic = rb_entry(n, struct cfq_io_context, rb_node);
 
-	list_for_each_entry(cic, &cic->list, list)
 		changed_ioprio(cic->cfqq);
+		n = rb_next(n);
+	}
 
 	return 0;
 }
@@ -1429,14 +1447,62 @@
 out:
 	return cfqq;
 }
 
+static struct cfq_io_context *
+cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
+{
+	struct rb_node *n = ioc->cic_root.rb_node;
+	struct cfq_io_context *cic;
+	void *key = cfqd;
+
+	while (n) {
+		cic = rb_entry(n, struct cfq_io_context, rb_node);
+
+		if (key < cic->key)
+			n = n->rb_left;
+		else if (key > cic->key)
+			n = n->rb_right;
+		else
+			return cic;
+	}
+
+	return NULL;
+}
+
+static inline void
+cfq_cic_rb_add(struct cfq_data *cfqd, struct io_context *ioc,
+	       struct cfq_io_context *cic)
+{
+	struct rb_node **p = &ioc->cic_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct cfq_io_context *__cic;
+
+	cic->ioc = ioc;
+	cic->key = cfqd;
+
+	while (*p) {
+		parent = *p;
+		__cic = rb_entry(parent, struct cfq_io_context, rb_node);
+
+		if (cic->key < __cic->key)
+			p = &(*p)->rb_left;
+		else if (cic->key > __cic->key)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&cic->rb_node, parent, p);
+	rb_insert_color(&cic->rb_node, &ioc->cic_root);
+	atomic_inc(&cfqd->ref);
+}
+
 /*
  * Setup general io context and cfq io context. There can be several cfq
  * io contexts per general io context, if this process is doing io to more
- * than one device managed by cfq. Note that caller is holding a reference to
- * cfqq, so we don't need to worry about it disappearing
+ * than one device managed by cfq.
  */
 static struct cfq_io_context *
-cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
+cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 {
 	struct io_context *ioc = NULL;
 	struct cfq_io_context *cic;
@@ -1447,61 +1513,17 @@ cfq_get_io_context(struct cfq_data *cfqd
 	if (!ioc)
 		return NULL;
 
-	if ((cic = ioc->cic) == NULL) {
-		cic = cfq_alloc_io_context(cfqd, gfp_mask);
-
-		if (cic == NULL)
-			goto err;
-
-		/*
-		 * manually increment generic io_context usage count, it
-		 * cannot go away since we are already holding one ref to it
-		 */
-		ioc->cic = cic;
-		ioc->set_ioprio = cfq_ioc_set_ioprio;
-		cic->ioc = ioc;
-		cic->key = cfqd;
-		atomic_inc(&cfqd->ref);
-	} else {
-		struct cfq_io_context *__cic;
-
-		/*
-		 * the first cic on the list is actually the head itself
-		 */
-		if (cic->key == cfqd)
-			goto out;
-
-		/*
-		 * cic exists, check if we already are there. linear search
-		 * should be ok here, the list will usually not be more than
-		 * 1 or a few entries long
-		 */
-		list_for_each_entry(__cic, &cic->list, list) {
-			/*
-			 * this process is already holding a reference to
-			 * this queue, so no need to get one more
-			 */
-			if (__cic->key == cfqd) {
-				cic = __cic;
-				goto out;
-			}
-		}
+	ioc->set_ioprio = cfq_ioc_set_ioprio;
 
-		/*
-		 * nope, process doesn't have a cic assoicated with this
-		 * cfqq yet. get a new one and add to list
-		 */
-		__cic = cfq_alloc_io_context(cfqd, gfp_mask);
-		if (__cic == NULL)
-			goto err;
+	cic = cfq_cic_rb_lookup(cfqd, ioc);
+	if (cic)
+		goto out;
 
-		__cic->ioc = ioc;
-		__cic->key = cfqd;
-		atomic_inc(&cfqd->ref);
-		list_add(&__cic->list, &cic->list);
-		cic = __cic;
-	}
+	cic = cfq_alloc_io_context(cfqd, gfp_mask);
+	if (cic == NULL)
+		goto err;
+
+	cfq_cic_rb_add(cfqd, ioc, cic);
 out:
 	return cic;
 err:
@@ -1534,7 +1556,33 @@ cfq_update_io_thinktime(struct cfq_data
 	cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
 }
 
-#define sample_valid(samples)	((samples) > 80)
+static void
+cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
+		       struct cfq_rq *crq)
+{
+	sector_t sdist;
+	u64 total;
+
+	if (cic->last_request_pos < crq->request->sector)
+		sdist = crq->request->sector - cic->last_request_pos;
+	else
+		sdist = cic->last_request_pos - crq->request->sector;
+
+	/*
+	 * Don't allow the seek distance to get too large from the
+	 * odd fragment, pagein, etc
+	 */
+	if (cic->seek_samples <= 60) /* second&third seek */
+		sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
+	else
+		sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64);
+
+	cic->seek_samples = (7*cic->seek_samples + 256) / 8;
+	cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
+	total = cic->seek_total + (cic->seek_samples/2);
+	do_div(total, cic->seek_samples);
+	cic->seek_mean = (sector_t)total;
+}
 
 /*
  * Disable idle window if the process thinks too long or seeks so much that
@@ -1608,7 +1656,7 @@ static void cfq_preempt_queue(struct cfq
 	cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2;
 	cfqq->slice_end = cfqq->slice_left + jiffies;
 
-	__cfq_slice_expired(cfqd, cfqq, 1);
+	__cfq_slice_expired(cfqd, cfqq, 1, 1);
 	__cfq_set_active_queue(cfqd, cfqq);
 }
 
@@ -1647,9 +1695,11 @@ cfq_crq_enqueued(struct cfq_data *cfqd,
 	cic = crq->io_context;
 
 	cfq_update_io_thinktime(cfqd, cic);
+	cfq_update_io_seektime(cfqd, cic, crq);
 	cfq_update_idle_window(cfqd, cfqq, cic);
 
 	cic->last_queue = jiffies;
+	cic->last_request_pos = crq->request->sector + crq->request->nr_sectors;
 
 	if (cfqq == cfqd->active_queue) {
 		/*
@@ -1716,7 +1766,7 @@ static void cfq_completed_request(reques
 			cfq_resort_rr_list(cfqq, 0);
 		}
 		if (cfq_cfqq_expired(cfqq)) {
-			__cfq_slice_expired(cfqd, cfqq, 0);
+			__cfq_slice_expired(cfqd, cfqq, 0, 1);
 			cfq_schedule_dispatch(cfqd);
 		}
 	}
@@ -1785,14 +1835,6 @@ static void cfq_prio_boost(struct cfq_qu
 		cfq_resort_rr_list(cfqq, 0);
 }
 
-static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
-{
-	if (rw == READ || process_sync(task))
-		return task->pid;
-
-	return CFQ_KEY_ASYNC;
-}
-
 static inline int
 __cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		struct task_struct *task, int rw)
@@ -1924,21 +1966,22 @@ cfq_set_request(request_queue_t *q, stru
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
-	cic = cfq_get_io_context(cfqd, key, gfp_mask);
+	cic = cfq_get_io_context(cfqd, gfp_mask);
 
 	spin_lock_irqsave(q->queue_lock, flags);
 
 	if (!cic)
 		goto queue_fail;
 
-	if (!cic->cfqq) {
+	cfqq = cic->cfqq;
+	if (!cfqq || cfqq->key != key) {
 		cfqq = cfq_get_queue(cfqd, key, tsk->ioprio, gfp_mask);
 		if (!cfqq)
 			goto queue_fail;
 
-		cic->cfqq = cfqq;
-	} else
-		cfqq = cic->cfqq;
+		if (!cic->cfqq)
+			cic->cfqq = cfqq;
+	}
 
 	cfqq->allocated[rw]++;
 	cfq_clear_cfqq_must_alloc(cfqq);
@@ -2124,7 +2167,6 @@ static int cfq_init_queue(request_queue_
 	for (i = 0; i < CFQ_PRIO_LISTS; i++)
 		INIT_LIST_HEAD(&cfqd->rr_list[i]);
 
-	INIT_LIST_HEAD(&cfqd->busy_rr);
 	INIT_LIST_HEAD(&cfqd->cur_rr);
 	INIT_LIST_HEAD(&cfqd->idle_rr);
 	INIT_LIST_HEAD(&cfqd->empty_list);
diff --git a/block/elevator.c b/block/elevator.c
index 6c3fc8a..1812e38 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -648,12 +648,18 @@ void elv_unregister(struct elevator_type
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
 		struct io_context *ioc = p->io_context;
-		if (ioc && ioc->cic) {
-			ioc->cic->exit(ioc->cic);
-			ioc->cic->dtor(ioc->cic);
-			ioc->cic = NULL;
+		struct cfq_io_context *cic;
+
+		if (!ioc)
+			continue;
+
+		if (ioc->cic_root.rb_node != NULL) {
+			cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node);
+			cic->exit(ioc);
+			cic->dtor(ioc);
 		}
-		if (ioc && ioc->aic) {
+
+		if (ioc->aic) {
 			ioc->aic->exit(ioc->aic);
 			ioc->aic->dtor(ioc->aic);
 			ioc->aic = NULL;
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 99c9ca6..f760a18 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3254,10 +3254,15 @@ void put_io_context(struct io_context *i
 	BUG_ON(atomic_read(&ioc->refcount) == 0);
 
 	if (atomic_dec_and_test(&ioc->refcount)) {
+		struct cfq_io_context *cic;
+
 		if (ioc->aic && ioc->aic->dtor)
 			ioc->aic->dtor(ioc->aic);
-		if (ioc->cic && ioc->cic->dtor)
-			ioc->cic->dtor(ioc->cic);
+
+		if (ioc->cic_root.rb_node != NULL) {
+			cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node);
+			cic->dtor(ioc);
+		}
 
 		kmem_cache_free(iocontext_cachep, ioc);
 	}
@@ -3269,6 +3274,7 @@ void exit_io_context(void)
 {
 	unsigned long flags;
 	struct io_context *ioc;
+	struct cfq_io_context *cic;
 
 	local_irq_save(flags);
 	task_lock(current);
@@ -3280,8 +3286,11 @@ void exit_io_context(void)
 
 	if (ioc->aic && ioc->aic->exit)
 		ioc->aic->exit(ioc->aic);
-	if (ioc->cic && ioc->cic->exit)
-		ioc->cic->exit(ioc->cic);
+
+	if (ioc->cic_root.rb_node != NULL) {
+		cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node);
+		cic->exit(ioc);
+	}
 
 	put_io_context(ioc);
 }
@@ -3311,7 +3320,8 @@ struct io_context *current_io_context(gf
 		ret->last_waited = jiffies; /* doesn't matter... */
 		ret->nr_batch_requests = 0; /* because this is 0 */
 		ret->aic = NULL;
-		ret->cic = NULL;
+		ret->cic_root.rb_node = NULL;
+
 		tsk->io_context = ret;
 	}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a33a31e..07fd67f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -54,23 +54,27 @@ struct as_io_context {
 struct cfq_queue;
 
 struct cfq_io_context {
-	/*
-	 * circular list of cfq_io_contexts belonging to a process io context
-	 */
-	struct list_head list;
-	struct cfq_queue *cfqq;
+	struct rb_node rb_node;
 	void *key;
 
+	struct cfq_queue *cfqq;
+	struct io_context *ioc;
+
 	unsigned long last_end_request;
-	unsigned long last_queue;
+	sector_t last_request_pos;
+	unsigned long last_queue;
+
 	unsigned long ttime_total;
 	unsigned long ttime_samples;
 	unsigned long ttime_mean;
 
-	void (*dtor)(struct cfq_io_context *);
-	void (*exit)(struct cfq_io_context *);
+	unsigned int seek_samples;
+	u64 seek_total;
+	sector_t seek_mean;
+
+	void (*dtor)(struct io_context *); /* destructor */
+	void (*exit)(struct io_context *); /* called on task exit */
 };
 
 /*
@@ -91,7 +95,7 @@ struct io_context {
 	int nr_batch_requests;	/* Number of requests left in the batch */
 
 	struct as_io_context *aic;
-	struct cfq_io_context *cic;
+	struct rb_root cic_root;
 };
 
 void put_io_context(struct io_context *ioc);
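For context on the seek detection in the first commit above: the heuristic
keeps decaying averages of the per-process seek distance and only treats a
process as seeky once enough samples have accumulated. The sketch below is a
standalone userspace model, not kernel code; it mirrors the arithmetic of
cfq_update_io_seektime() and the sample_valid()/131072 threshold used by
cfq_arm_slice_timer(), fed with made-up request positions.

    /* Standalone model of the CFQ seek-mean bookkeeping -- illustrative only. */
    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define SEEKY_MEAN_THRESHOLD	131072	/* threshold used in cfq_arm_slice_timer() */
    #define sample_valid(samples)	((samples) > 80)

    struct seek_state {
    	unsigned int samples;
    	uint64_t total;
    	uint64_t mean;
    	uint64_t last_pos;
    };

    static uint64_t min_u64(uint64_t a, uint64_t b)
    {
    	return a < b ? a : b;
    }

    /* decaying 7/8 average of the distance between consecutive request positions */
    static void update_seek(struct seek_state *s, uint64_t pos)
    {
    	uint64_t sdist = pos > s->last_pos ? pos - s->last_pos : s->last_pos - pos;

    	/* clamp the odd huge jump, as the patch does while samples are few */
    	if (s->samples <= 60)
    		sdist = min_u64(sdist, s->mean * 4 + 2*1024*1024);
    	else
    		sdist = min_u64(sdist, s->mean * 4 + 2*1024*64);

    	s->samples = (7 * s->samples + 256) / 8;
    	s->total   = (7 * s->total + 256 * sdist) / 8;
    	s->mean    = (s->total + s->samples / 2) / s->samples;
    	s->last_pos = pos;
    }

    int main(void)
    {
    	struct seek_state seq = { 0 }, rnd = { 0 };

    	for (int i = 0; i < 200; i++) {
    		update_seek(&seq, seq.last_pos + 8);			/* sequential reader */
    		update_seek(&rnd, (uint64_t)rand() * 997 % (1u << 30));	/* random reader */
    	}

    	printf("sequential: mean %llu -> %s idle window\n",
    	       (unsigned long long)seq.mean,
    	       sample_valid(seq.samples) && seq.mean > SEEKY_MEAN_THRESHOLD ?
    	       "mostly disable" : "keep");
    	printf("random:     mean %llu -> %s idle window\n",
    	       (unsigned long long)rnd.mean,
    	       sample_valid(rnd.samples) && rnd.mean > SEEKY_MEAN_THRESHOLD ?
    	       "mostly disable" : "keep");
    	return 0;
    }

A steadily sequential reader settles at a tiny mean and keeps its idle
window, while a random reader quickly crosses the threshold and gets only the
token 2-jiffy idle period that the changelog describes.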